In [1]:
#!/usr/bin/python

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

import sqlite3

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import chi2_kernel

import db_wrapper

def build_dataset(db):
    """Build the film-by-list membership matrix stored in the database.

    Args:
        db: Object exposing ``get_cursor()``; the returned cursor must support
            ``execute(sql).fetchall()``.

    Returns:
        pandas.DataFrame of floats with one row per distinct ``film_link`` in
        ``films`` and one column per distinct ``list_link`` in ``lists``. A
        cell is 1.0 when ``list_items`` holds a row pairing that list with
        that film and 0.0 otherwise.

    Notes:
        ``list_items`` is read with ``SELECT *``; position 1 of each row is
        used as the list link and position 2 as the film link. Rows that name
        a film or list missing from the ``films`` / ``lists`` tables are
        ignored.
    """
    films = db.get_cursor().execute("SELECT DISTINCT(film_link) FROM films").fetchall()
    lists = db.get_cursor().execute("SELECT DISTINCT(list_link) FROM lists").fetchall()

    films_flat = [record[0] for record in films]
    lists_flat = [record[0] for record in lists]

    # Label -> position lookups so each membership is written with one array
    # store instead of a pandas label lookup per row.
    film_pos = dict((film, pos) for pos, film in enumerate(films_flat))
    list_pos = dict((lst, pos) for pos, lst in enumerate(lists_flat))

    membership = np.zeros(shape=(len(films_flat), len(lists_flat)))

    for item in db.get_cursor().execute("SELECT * FROM list_items").fetchall():
        row = film_pos.get(item[2])
        col = list_pos.get(item[1])
        if row is None or col is None:
            continue
        # Write straight into the backing array. The previous chained form
        # df[col][row] = 1 assigned through an intermediate Series, which
        # pandas does not guarantee to propagate back to the frame.
        membership[row, col] = 1

    return pd.DataFrame(membership, index=films_flat, columns=lists_flat)

# Open icheckmovies.db through the project's db_wrapper and build the
# film-by-list membership frame used by every later cell.
# NOTE(review): later cells rebind `db` to a fitted DBSCAN model, so the
# database handle is not reachable under this name afterwards.
db = db_wrapper.db_wrapper(name="icheckmovies.db")
df = build_dataset(db)

In [2]:
# Number of unordered film pairs (i, j) with j < i. For n films this is
# n * (n - 1) / 2, so it is computed directly rather than by stepping through
# every pair in a Python loop.
film_total = len(df)
count = film_total * (film_total - 1) // 2

# One (still empty) neighbour dict per film, keyed by the film's index label.
connections = {film_link: {} for film_link in df.index}

print(count)

100231561


In [None]:
# Total, over every unordered pair of lists, of the number of films that
# appear on both lists.
# Cells equal to 1 mark membership; everything else counts as absent.
membership = (df.values == 1).astype(np.int64)

# shared[i, j] = number of films present on both list i and list j.
shared = membership.T.dot(membership)

# `shared` is symmetric and its diagonal pairs each list with itself, so the
# sum over pairs i < j is half of the off-diagonal total.
count = int((shared.sum() - shared.diagonal().sum()) // 2)

# One (still empty) neighbour dict per list, keyed by the list's column label.
connections = {list_link: {} for list_link in df.columns}

print(count)

In [4]:
# Standardise every column of df with scikit-learn's StandardScaler before
# clustering; the result is a plain NumPy array, one row per row of df.
X = StandardScaler().fit_transform(df)

In [5]:
# Sanity check on the scaled matrix: (rows of df, columns of df).
X.shape

(14159L, 174L)

In [8]:
# Fit DBSCAN on the standardised rows of X.
# NOTE(review): this rebinds `db`, which the first cell used for the database
# wrapper; re-running build_dataset(db) after this cell would receive the model.
db = DBSCAN(eps=0.3, min_samples=10).fit(X)

In [10]:
import matplotlib.pyplot as plt

# Take the labels and the cluster count from the model being plotted, so the
# cell does not depend on `labels` / `n_clusters_` left behind by another
# cell (possibly from a fit on different data).
# NOTE(review): assumes `db` is the DBSCAN model that was fitted on `X`.
labels = db.labels_
# Label -1 marks noise and is not counted as a cluster.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    # Core samples of this cluster: large markers. Only the first two columns
    # of X are drawn.
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)

    # Remaining (non-core) members of this cluster: small markers.
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [11]:
from sklearn.cluster import KMeans

# Fixed seed so the k-means partition is repeatable between runs.
random_state = 1700

plt.figure(figsize=(12, 12))

# Split the rows of X into ten clusters and keep each row's cluster id.
clusterer = KMeans(random_state=random_state, n_clusters=10)
y_pred = clusterer.fit_predict(X)

# Tabulate cluster id against the number of rows assigned to it.
unique, counts = np.unique(y_pred, return_counts=True)
cluster_sizes = np.asarray((unique, counts)).T

print(cluster_sizes)

[[    0   113]
 [    1   744]
 [    2   748]
 [    3   116]
 [    4 11129]
 [    5   305]
 [    6    33]
 [    7   460]
 [    8    83]
 [    9   428]]


In [127]:
import numpy.linalg
# NOTE(review): the rank computed here is neither stored nor displayed; a
# notebook only echoes the last expression of a cell.
np.linalg.matrix_rank(df.T)
# Number of rows of df that remain after dropping exact duplicate rows.
len(df.drop_duplicates())


4144

In [34]:
# Keep only the films whose row total exceeds 10.
row_totals = df.sum(axis=1)
popular = df[row_totals > 10]
print(len(popular))
print(np.linalg.matrix_rank(popular)) # 157 of the 174 lists are represented

# Pairwise chi-squared kernel between the rows of `popular`.
K2 = chi2_kernel(popular)

# Scan the strict upper triangle (i < j) for the largest kernel value and
# count the pairs whose kernel value is exactly 1.
exact = 0
best = 0
best_i = 0
best_j = 0
for i in range(len(K2)):
    for j in range(i + 1, len(K2)):
        similarity = K2[i][j]
        if similarity == 1:
            exact += 1
        if similarity > best:
            best, best_i, best_j = similarity, i, j

586
157


In [39]:
# Report the current values of best_i / best_j / exact, followed by the
# largest and smallest entries of K2. np.amax / np.amin scan the whole
# matrix, diagonal included.
print best_i
print best_j
print exact
print np.amax(K2)
print np.amin(K2)

483
0
1
1.0
5.24288566336e-22


In [17]:
# Index label (film link) stored at position 317 of `popular`.
popular.index[317]

u'/movies/olympia+2.+teil+-+fest+der+schonheit/'

In [42]:
# Film whose closest neighbour under the chi-squared kernel is wanted.
# Earlier runs of this cell used u'/movies/alien/' and u'/movies/fight+club/'.
query_film = u'/movies/the+green+mile/'

# Row position of the query film inside `popular` (and therefore inside K2).
item = popular.index.tolist().index(query_film)

print(item)

best = 0
best_i = 0
similarities = K2[item]
print(len(similarities))
for i, similarity in enumerate(similarities):
    # Skip the film's own entry so it cannot be reported as its own neighbour.
    if i == item:
        continue
    if similarity > best:
        best = similarity
        best_i = i

# Kernel value, neighbour position, neighbour link, query link.
print("%s %s %s %s" % (best, best_i, popular.index[best_i], popular.index[item]))

460
586
0.00247875217667 34 /movies/american+history+x/ /movies/the+green+mile/


In [19]:
from sklearn.cluster import KMeans

# Fixed seed so the k-means partition is repeatable between runs.
random_state = 1700

plt.figure(figsize=(12, 12))

# NOTE: rebinds X to the standardised `popular` subset.
X = StandardScaler().fit_transform(popular)

clusterer = KMeans(random_state=random_state, n_clusters=10)
y_pred = clusterer.fit_predict(X)

unique, counts = np.unique(y_pred, return_counts=True)

# Index labels of `popular` grouped by the cluster id assigned to each row.
groups = dict(
    (cluster_id, popular.index[y_pred == cluster_id]) for cluster_id in unique
)

print(groups)
print(np.asarray((unique, counts)).T)

{0: Index([                                                                          u'/movies/a+hard+days+night/',
                                                                        u'/movies/a+matter+of+life+and+death/',
                                                                                   u'/movies/black+narcissus/',
                                                                                            u'/movies/blowup/',
                                                                                            u'/movies/brazil/',
                                                                                   u'/movies/brief+encounter/',
                                                                                     u'/movies/dont+look+now/',
                                                                                u'/movies/great+expectations/',
                                                                                            u'/movie

In [20]:
from sklearn.cluster import KMeans

# Fixed seed so the k-means partition is repeatable between runs.
random_state = 1700

plt.figure(figsize=(12, 12))

# Transpose df so that each column of df becomes one sample, then standardise.
X2 = StandardScaler().fit_transform(df.T)

clusterer = KMeans(random_state=random_state, n_clusters=10)
y_pred = clusterer.fit_predict(X2)

# Cluster id against the number of samples assigned to it.
unique, counts = np.unique(y_pred, return_counts=True)
print(np.asarray((unique, counts)).T)

# Same samples through DBSCAN; this rebinds `db` to the new model.
db = DBSCAN(min_samples=10, eps=0.3).fit(X2)
labels = db.labels_
# Label -1 marks noise and is not counted as a cluster.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

[[  0   2]
 [  1   1]
 [  2   1]
 [  3   1]
 [  4 164]
 [  5   1]
 [  6   1]
 [  7   1]
 [  8   1]
 [  9   1]]
Estimated number of clusters: 0


In [27]:
# Importing the submodule explicitly guarantees scipy.stats is loaded; a bare
# `import scipy` only works here if some other import has already loaded it.
import scipy.stats

# One row per column of df.
by_list = df.T

# Placeholder array; nothing in this cell reads or fills it.
counts = np.empty([len(by_list), len(by_list)])

# Pearson correlation for every unordered pair of rows of by_list. The rows
# are collected in a plain list and turned into a DataFrame once, because
# appending to a DataFrame row by row re-allocates it on every insert.
list_names = by_list.index
list_rows = by_list.values
records = []
for i in range(0, len(by_list)):
    for j in range(i + 1, len(by_list)):
        corr = scipy.stats.pearsonr(list_rows[i], list_rows[j])[0]
        records.append((list_names[i], list_names[j], corr, abs(corr)))

# Row labels start at 1, matching the labels produced by the earlier
# row-by-row construction.
results = pd.DataFrame(
    records,
    columns=['list 1', 'list 2', 'phi', 'abs_phi'],
    index=range(1, len(records) + 1),
)

In [31]:
# The twenty pairs with the lowest (most negative) phi value.
# DataFrame.sort was removed from pandas; sort_values is the supported call.
results.sort_values('phi', ascending=True).head(20)

Unnamed: 0,list 1,list 2,phi,abs_phi
10568,/lists/doubling+the+canon/,/lists/they+shoot+pictures+dont+they/,-0.086335,0.086335
14669,/lists/the+21st+centurys+most+acclaimed+films/,/lists/unescos+memory+of+the+world+national+ci...,-0.068828,0.068828
7081,/lists/all-time+worldwide+box+office/,/lists/doubling+the+canon/,-0.065431,0.065431
13748,/lists/national+film+registry/,/lists/the+21st+centurys+most+acclaimed+films/,-0.060458,0.060458
14911,/lists/they+shoot+zombies+dont+theyquestion/,/lists/unescos+memory+of+the+world+national+ci...,-0.058229,0.058229
7397,/lists/amos+vogels+film+as+a+subversive+art/,/lists/the+21st+centurys+most+acclaimed+films/,-0.057819,0.057819
13370,/lists/more+noirs+from+tspdt/,/lists/the+21st+centurys+most+acclaimed+films/,-0.057163,0.057163
7979,/lists/asian+cinema+a+field+guide/,/lists/national+film+registry/,-0.056092,0.056092
7113,/lists/all-time+worldwide+box+office/,/lists/jonathan+rosenbaums+essential+films/,-0.055049,0.055049
7972,/lists/asian+cinema+a+field+guide/,/lists/more+noirs+from+tspdt/,-0.053799,0.053799
