In [1]:
import pandas as pd
import os
import numpy as np

path = '../t-sne'
os.chdir(path)

movies = pd.read_csv('tmdb_movies_data.csv', encoding = 'utf8')
movies.dropna(subset=['overview'], inplace=True)
movies.apply(lambda x: pd.api.types.infer_dtype(x.values))


genres = movies['genres'].str.split('|',expand=True)
genres = genres.values
genres = pd.DataFrame(genres)
genres.columns = ['Genre 1', 'Genre 2', 'Genre 3', 'Genre 4', 'Genre 5']
genres = genres.applymap(lambda x: '' if x is None else str(x))

# Only 1 genre
movies = movies.iloc[(genres.apply(lambda x: sum(x != ''), axis = 1) == 1).values]
movies = movies.rename(columns = {'genres':'Genre 1', 'overview':'Description'})

# Get rid of Mr. and Mrs. split
movies['Description'] = movies['Description'].str.replace('Mrs\\.', 'Mrs')
movies['Description'] = movies['Description'].str.replace('Mr\\.', 'Mr')

# More than 1 sentence
movies = movies.iloc[((~movies['Description'].str.split('\\. ',expand=True).isnull()).apply(sum, axis = 1) > 3).values]

# Both genre and description not null
movies = movies.iloc[((movies[['Genre 1', 'Description']].isnull()).apply(sum, axis = 1) == 0).values]

# Genre share at least 10%
movies = movies.set_index('Genre 1').join(pd.DataFrame(movies.groupby('Genre 1').size()/len(movies) > 0.1)).rename(columns = {0:'to_drop'}).reset_index()

movies = movies.iloc[movies['to_drop'].values]
movies = movies.drop(['to_drop'], axis = 1)

# Splitting by sentence
sentences = pd.DataFrame(movies['Description'].str.split('\\. ',expand=True).unstack()).reset_index().sort_values(['level_1', 'level_0'])
sentences = sentences[sentences[0].apply(lambda x: x is not None)]
sentences = sentences.set_index('level_1').drop('level_0', axis = 1).rename(columns = {0:'overview'})

  values, self.f, axis=self.axis, dummy=dummy, labels=labels


In [2]:
# Splitting by sentence
sentences = pd.DataFrame(movies['Description'].str.split('\\. ',expand=True).unstack()).reset_index().sort_values(['level_1', 'level_0'])
sentences = sentences[sentences[0].apply(lambda x: x is not None)]
sentences = sentences.set_index('level_1').drop('level_0', axis = 1).rename(columns = {0:'overview'})
sentences

Unnamed: 0_level_0,overview
level_1,Unnamed: 1_level_1
23,"After the Ball, a retail fairy tale set in the..."
23,Kate's dream is to design for couturier houses
23,"Although she is a bright new talent, Kate can'..."
23,"No one trusts the daughter of Lee Kassell, a r..."
23,Who wants a spy among the sequins and stiletto...
23,But with the help of a prince of a guy in the ...
24,This material was developed and prepared over ...
24,This special kind of goes back to when he used...
24,It felt right to him to shoot this special in ...
24,The show is about an hour long


In [3]:
def mj_dtm(description):
    from sklearn.feature_extraction.text import CountVectorizer
    import string
    from nltk.corpus import stopwords
    from nltk.stem.lancaster import LancasterStemmer

    stop = stopwords.words('english')

    docs = description

    docs = docs.apply(lambda x: x.translate(str.maketrans({key: None for key in string.punctuation})))
    docs = docs.apply(lambda x: x.lower())

    docs = docs.apply(lambda x: [x for x in x.split() if x not in stop])
    docs = docs.apply(lambda x: ' '.join(x))

    lancaster = LancasterStemmer()

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()

    # docs = [lancaster.stem(word) for word in docs]

    docs = [wordnet_lemmatizer.lemmatize(word) for word in docs]

    vec = CountVectorizer()

    X = vec.fit_transform(docs)
    df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

    # Words occurred in more than 1 movie
    df = df.iloc[:,((df>0).apply(sum) > 1).values]
    import re
    df = df.iloc[:, pd.Series(df.columns).apply(lambda x: re.match("^[0-9]", x) is None).values]
    return df

In [4]:
df = mj_dtm(movies['Description'])

In [5]:
df_test = mj_dtm(sentences['overview'])
df_test = df_test.set_index(sentences.index.values)

In [6]:
dtm = movies['Genre 1'].reset_index()[movies['Genre 1'].reset_index().columns.difference(['index'])]
groups_filter = pd.concat([dtm, df], axis=1)
groups_filter = groups_filter.groupby('Genre 1').sum()[df.columns]
groups_filter = groups_filter.T
groups_filter['n_words'] = groups_filter.apply(sum, axis = 1)

# Taking into consideration n_words which is always > 0
groups_filter['is_common'] = groups_filter.apply(lambda x: 1 if sum(x > 0) > 2 else 0, axis = 1)

groups_filter.sort_values(['is_common', 'n_words'], ascending = False)

# Get rid of top 20% words which are common (?)
groups_filter = groups_filter.sort_values(['is_common', 'n_words'], ascending = False).iloc[int(np.floor(groups_filter.shape[0]/5)):groups_filter.shape[0]]

word_list = list(groups_filter.index)
word_list

['broken',
 'building',
 'buried',
 'chance',
 'changed',
 'childhood',
 'chronicles',
 'community',
 'computer',
 'craig',
 'creative',
 'daily',
 'darkness',
 'desire',
 'detective',
 'determined',
 'dont',
 'drugs',
 'du',
 'economic',
 'else',
 'entire',
 'familys',
 'far',
 'fashion',
 'fast',
 'feelings',
 'finding',
 'francis',
 'french',
 'front',
 'george',
 'got',
 'grown',
 'half',
 'happy',
 'hero',
 'hospital',
 'hot',
 'hour',
 'ice',
 'incredible',
 'india',
 'intimate',
 'involved',
 'jane',
 'joins',
 'joy',
 'lead',
 'legend',
 'los',
 'loves',
 'major',
 'martial',
 'mary',
 'match',
 'matt',
 'megan',
 'middle',
 'missing',
 'modern',
 'mom',
 'moment',
 'moon',
 'nazi',
 'near',
 'nobody',
 'oscar',
 'outside',
 'owner',
 'part',
 'photos',
 'pictures',
 'plays',
 'popular',
 'power',
 'private',
 'professional',
 'quite',
 'radio',
 'release',
 'returning',
 'revolution',
 'rose',
 'safe',
 'scott',
 'sea',
 'secrets',
 'sees',
 'shocking',
 'single',
 'slowly',
 

In [7]:
groups_filter.iloc[:, 0:groups_filter.shape[1]-2]
groups_filter = groups_filter.iloc[:, 0:groups_filter.shape[1]-2]
n_filter = (groups_filter > 0).sum()
n_filter

Genre 1
Comedy         1213
Documentary     992
Drama          1444
Horror          706
dtype: int64

In [8]:
groups_filter = groups_filter.T

def sorted(s, num):
    tmp = s.sort_values(ascending=False)[:num]  # earlier s.order(..)
#     tmp.index = range(num)
    return tmp

groups_filter = groups_filter[word_list]
groups_filter
sorted_words = groups_filter.T.apply(lambda x: sorted(x, 200))
sorted_words

Genre 1,Comedy,Documentary,Drama,Horror
abandoned,,,,6.0
accepts,2.0,,,
accidentally,,,,2.0
accounts,,2.0,,
actors,,3.0,,
actress,,,4.0,
actually,2.0,,,
adams,,,4.0,
adulthood,,2.0,,
adults,3.0,,,


In [9]:
# # Each row now has 4 intersections with all 4 genres
# df_array = df_test[list(sorted_words.index)].values
# tfm = [row * (~sorted_words.isnull().T) for row in df_array]

In [10]:
comedy_base = ~sorted_words['Comedy'][~sorted_words.isnull()['Comedy'].T].isnull()
horror_base = ~sorted_words['Horror'][~sorted_words.isnull()['Horror'].T].isnull()
documentary_base = ~sorted_words['Documentary'][~sorted_words.isnull()['Documentary'].T].isnull()
drama_base = ~sorted_words['Drama'][~sorted_words.isnull()['Drama'].T].isnull()

In [11]:
comedy_test = df_test[comedy_base.index.values]
horror_test = df_test[horror_base.index.values]
documentary_test = df_test[documentary_base.index.values]
drama_test = df_test[drama_base.index.values]

### VRC

In [12]:
import scipy as sp
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
from rpy2.robjects import r
# import pandas.rpy.common as com

# import R's "base" package
base = importr('base')

# import R's "utils" package
utils = importr('utils')

# import rpy2's package module
import rpy2.robjects.packages as rpackages

# import R's utility package
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

# R package names
packnames = ('TDA')

# R vector of strings
from rpy2.robjects.vectors import StrVector

utils.install_packages(StrVector('TDA'))
r('library(TDA)')

0,1,2,3,4,5,6
'TDA','tools','stats',...,'datasets','methods','base'


In [13]:
import rpy2
from rpy2.robjects import pandas2ri # install any dependency package if you get error like "module not found"
from rpy2.robjects.conversion import localconverter

def r_convert(x):
    pandas2ri.activate()

    with localconverter(ro.default_converter + pandas2ri.converter):
        r_df = ro.conversion.py2rpy(x)

    from rpy2.robjects import globalenv
    globalenv['r_df'] = r_df
    r('Diag <- ripsDiag(X = r_df, 1, max(r_df), library = "GUDHI", printProgress = FALSE)')
    return(r('Diag$diagram'))

In [14]:
comedy_rips = comedy_test.groupby(comedy_test.index).apply(lambda x: r_convert(x))
horror_rips = horror_test.groupby(horror_test.index).apply(lambda x: r_convert(x))
documentary_rips = documentary_test.groupby(documentary_test.index).apply(lambda x: r_convert(x))
drama_rips = drama_test.groupby(drama_test.index).apply(lambda x: r_convert(x))

In [15]:
comedy_rips

23     [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...
24     [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...
25                    [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]]
26     [[0.0, 0.0, 2.0], [0.0, 0.0, 2.0], [0.0, 0.0, ...
27     [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...
                             ...                        
409                                    [[0.0, 0.0, 0.0]]
410                   [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]]
411                   [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]]
412                   [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]]
413                                    [[0.0, 0.0, 0.0]]
Length: 383, dtype: object

In [33]:
comedy_rips[comedy_rips.apply(lambda x: np.any(x[:, 0] == 1)).values]

Series([], dtype: object)

In [23]:
type(comedy_rips.iloc[0])

numpy.ndarray

In [272]:
r('dev.new()') # optional: create a new figure
r('plot(Diag$diagram, barcode=TRUE)')
r('dev.off()')

R[write to console]: dev.new(): using pdf(file="Rplots1.pdf")



array([1], dtype=int32)

In [368]:
np.arccos(np.round(cosine_similarity(comedy_test.loc[46]), 5))

array([[0.        , 1.57079633, 1.57079633, 1.57079633, 1.57079633],
       [1.57079633, 0.        , 0.95531695, 1.57079633, 1.57079633],
       [1.57079633, 0.95531695, 0.        , 1.57079633, 1.23096295],
       [1.57079633, 1.57079633, 1.57079633, 0.        , 1.57079633],
       [1.57079633, 1.57079633, 1.23096295, 1.57079633, 0.        ]])

In [358]:
from sklearn.metrics.pairwise import cosine_similarity
np.arccos(cosine_similarity(comedy_test.loc[46]))

  


array([[2.10734243e-08, 1.57079633e+00, 1.57079633e+00, 1.57079633e+00,
        1.57079633e+00],
       [1.57079633e+00, 0.00000000e+00, 9.55316618e-01, 1.57079633e+00,
        1.57079633e+00],
       [1.57079633e+00, 9.55316618e-01,            nan, 1.57079633e+00,
        1.23095942e+00],
       [1.57079633e+00, 1.57079633e+00, 1.57079633e+00, 0.00000000e+00,
        1.57079633e+00],
       [1.57079633e+00, 1.57079633e+00, 1.23095942e+00, 1.57079633e+00,
                   nan]])

In [335]:
comedy_test.loc[46].sum().sum()

7

In [337]:
documentary_test.loc[46].sum().sum()

2

In [338]:
drama_test.loc[46].sum().sum()

1

In [336]:
horror_test.loc[46].sum().sum()

0

In [None]:
# r('dev.off()')
r('dev.new()') # optional: create a new figure
r('plot(diag.info$diagram, barcode=TRUE)')
r('dev.off()')

In [None]:

df = df.iloc[:, (groups_filter.apply(lambda x: sum(x != 0)) == 1).values]

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(df)
tfidf = pd.DataFrame(tfidf.toarray())
tfidf.columns = df.columns.difference(['Genre 1'])
tfidf

tfidf = tfidf.reset_index()[tfidf.reset_index().columns.difference(['index'])]
first_class = pd.concat([dtm, tfidf], axis=1)
first_class