### Import & Data Preprocessing

In [1]:
import pandas as pd
import os
import numpy as np
import math

path = '../tda'
os.chdir(path)

movies = pd.read_csv('AllMoviesDetailsCleaned.csv', encoding = 'utf8', sep = ";")
movies.dropna(subset=['overview'], inplace=True)
movies.apply(lambda x: pd.api.types.infer_dtype(x.values))


genres = movies['genres'].str.split('|',expand=True)
genres = genres.values
genres = pd.DataFrame(genres)
genres.columns = ['Genre 1', 'Genre 2', 'Genre 3', 'Genre 4', 'Genre 5', 'Genre 6', 'Genre 7', 'Genre 8', 'Genre 9', 'Genre 10', 'Genre 11']
genres = genres.applymap(lambda x: '' if x is None else str(x))

# Only 1 genre
movies = movies.iloc[(genres.apply(lambda x: sum(x != ''), axis = 1) == 1).values]
movies = movies.rename(columns = {'genres':'Genre 1', 'overview':'Description'})

# Get rid of Mr. and Mrs. split
movies['Description'] = movies['Description'].str.replace('Mrs\\.', 'Mrs')
movies['Description'] = movies['Description'].str.replace('Mr\\.', 'Mr')

# More than 3 sentences
movies = movies.iloc[((~movies['Description'].str.split('\\. ',expand=True).isnull()).apply(sum, axis = 1) > 3).values]

# Both genre and description not null
movies = movies.iloc[((movies[['Genre 1', 'Description']].isnull()).apply(sum, axis = 1) == 0).values]

# Genre share at least 10%
movies = movies.set_index('Genre 1').join(pd.DataFrame(movies.groupby('Genre 1').size()/len(movies) > 0.1)).rename(columns = {0:'to_drop'}).reset_index()

movies = movies.iloc[movies['to_drop'].values]
movies = movies.drop(['to_drop'], axis = 1)

# Splitting by sentence
sentences = pd.DataFrame(movies['Description'].str.split('\\. ',expand=True).unstack()).reset_index().sort_values(['level_1', 'level_0'])
sentences = sentences[sentences[0].apply(lambda x: x is not None)]
sentences = sentences.set_index('level_1').drop('level_0', axis = 1).rename(columns = {0:'overview'})

  interactivity=interactivity, compiler=compiler, result=result)
  values, self.f, axis=self.axis, dummy=dummy, labels=labels


### NLP Processing

In [165]:
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

def mj_dtm(description):
    
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)
    
    stop = stopwords.words('english')

    docs = description

    docs = docs.apply(lambda x: x.translate(str.maketrans({key: None for key in string.punctuation})))
    docs = docs.apply(lambda x: x.lower())

    docs = docs.apply(lambda x: x.split())
    
    wordnet_lemmatizer = WordNetLemmatizer()
    docs = docs.apply(lambda x: [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x])
    
    lancaster = LancasterStemmer()
    docs = docs.apply(lambda x: [lancaster.stem(word) for word in x])

    docs = docs.apply(lambda x: [word for word in x if word not in stop])
    
    docs = docs.apply(lambda x: ' '.join(x))

#     vec = CountVectorizer()

#     X = vec.fit_transform(docs)
#     df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

#     # Words occurred in more than 1 movie
#     df = df.iloc[:,((df>0).apply(sum) > 1).values]

#     df = df.iloc[:, pd.Series(df.columns).apply(lambda x: re.match("^[0-9]", x) is None).values]
#     df = df.iloc[:, pd.Series(df.columns).apply(lambda x: re.match("^[a-z]", x) is not None).values]
    return df

In [166]:
mj_dtm(movies['Description'])

Unnamed: 0,aa,aadh,aamp,aampm,aan,aanwezig,aaron,ab,abab,abac,...,zubin,zug,zukerm,zul,zulu,zum,zurich,zwisch,zwraca,zé
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Bag of Words Creation

In [65]:
df = mj_dtm(movies['Description'])

In [127]:
df_test = mj_dtm(sentences['overview'])
df_test = df_test.set_index(sentences.index.values)

In [162]:
(groups_filter.set_index('Genre 1').sum(axis = 1) > 50).groupby('Genre 1').sum()

Genre 1
Comedy         1803.0
Documentary    3927.0
Drama          3263.0
dtype: float64

In [24]:
# from sklearn.feature_extraction.text import TfidfTransformer
# transformer = TfidfTransformer(smooth_idf=False)

# dtm = movies['Genre 1'].reset_index()[movies['Genre 1'].reset_index().columns.difference(['index'])]
# groups_filter = pd.concat([dtm, df], axis=1)
# tfidf = groups_filter.groupby('Genre 1').apply(lambda x: pd.DataFrame(transformer.fit_transform(x).toarray()))
# # tfidf = pd.DataFrame(tfidf.toarray())
# # tfidf.columns = df.columns.difference(['Genre 1'])
# tfidf.columns = df.columns.values
# tfidf

# # tfidf = tfidf.reset_index()[tfidf.reset_index().columns.difference(['index'])]
# # first_class = pd.concat([dtm, tfidf], axis=1)
# # first_class

  idf = np.log(n_samples / df) + 1
  idf = np.log(n_samples / df) + 1
  idf = np.log(n_samples / df) + 1


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,20280,20281,20282,20283,20284,20285,20286,20287,20288,20289
Genre 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Comedy,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Comedy,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Categorical Cross Entropy

In [67]:
dtm = movies['Genre 1'].reset_index()[movies['Genre 1'].reset_index().columns.difference(['index'])]
groups_filter = pd.concat([dtm, df], axis=1)
sum_groups = groups_filter.groupby('Genre 1').sum()
sum_overall = sum_groups.sum()
categorical_cross_entropy = (1 - (sum_groups * np.log(sum_groups/sum_overall))/(sum_overall*np.log(1/3))).fillna(0)
categorical_cross_entropy

  """


Unnamed: 0_level_0,aa,aadh,aamp,aampm,aan,aanwezig,aaron,ab,abab,abac,...,zubin,zug,zukerm,zul,zulu,zum,zurich,zwisch,zwraca,zé
Genre 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Comedy,0.0,0.0,0.0,0.0,0.879731,1.0,0.667009,0.666026,0.684535,0.0,...,0.0,0.0,0.0,1.0,0.666667,0.753953,0.0,0.684535,1.0,0.674196
Documentary,1.0,0.0,1.0,1.0,0.746965,0.0,0.667009,0.793329,0.684535,0.753953,...,1.0,0.684535,1.0,0.0,0.728178,0.728178,0.666667,0.684535,0.0,0.0
Drama,0.0,1.0,0.0,0.0,0.0,0.0,0.715736,0.704821,0.684535,0.666667,...,0.0,0.684535,0.0,0.0,0.684535,0.728178,0.753953,0.0,0.0,0.781236


### Choosing genre top_words

In [85]:
comedy_top_words = categorical_cross_entropy.loc['Comedy'][sum_groups.loc['Comedy'].sort_values(ascending = False)[:500].index.values].sort_values(ascending = False)

In [86]:
documentary_top_words = categorical_cross_entropy.loc['Documentary'][sum_groups.loc['Documentary'].sort_values(ascending = False)[:500].index.values].sort_values(ascending = False)

In [87]:
drama_top_words = categorical_cross_entropy.loc['Drama'][sum_groups.loc['Drama'].sort_values(ascending = False)[:500].index.values].sort_values(ascending = False)

In [17]:
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2)) 

In [93]:
common_top_words = intersection(intersection(comedy_top_words.index.values, documentary_top_words.index.values), drama_top_words.index.values)

In [124]:
comedy_top_words = comedy_top_words[np.setdiff1d(comedy_top_words.index.values, common_top_words)].sort_values(ascending = False)[:200]

In [125]:
documentary_top_words = documentary_top_words[np.setdiff1d(documentary_top_words.index.values, common_top_words)].sort_values(ascending = False)[:200]

In [126]:
drama_top_words = drama_top_words[np.setdiff1d(drama_top_words.index.values, common_top_words)].sort_values(ascending = False)[:200]

In [62]:
# dtm = movies['Genre 1'].reset_index()[movies['Genre 1'].reset_index().columns.difference(['index'])]
# groups_filter = pd.concat([dtm, df], axis=1)
# groups_filter = groups_filter.groupby('Genre 1').sum()[df.columns]
# groups_filter = groups_filter.T
# groups_filter['n_words'] = groups_filter.apply(sum, axis = 1)

# # Taking into consideration n_words which is always > 0
# groups_filter['is_common'] = groups_filter.apply(lambda x: 1 if sum(x > 0) > 2 else 0, axis = 1)

# groups_filter.sort_values(['is_common', 'n_words'], ascending = False)

# # Get rid of top 20% words which are common (?)
# groups_filter = groups_filter.sort_values(['is_common', 'n_words'], ascending = False).iloc[int(np.floor(groups_filter.shape[0]/5)):groups_filter.shape[0]]

# word_list = list(groups_filter.index)
# word_list

['beirut',
 'bench',
 'berry',
 'biology',
 'blackmails',
 'bombings',
 'booming',
 'bout',
 'bow',
 'branded',
 'brothels',
 'brutality',
 'bryan',
 'bumbling',
 'burlesque',
 'butterfly',
 'ca',
 'cambodia',
 'cameraman',
 'censorship',
 'champ',
 'chelsea',
 'chernobyl',
 'churches',
 'cindy',
 'clinton',
 'coincidence',
 'colin',
 'collage',
 'collected',
 'competitions',
 'compromising',
 'congolese',
 'consequently',
 'consumption',
 'contend',
 'counts',
 'crashed',
 'crashing',
 'creators',
 'crowds',
 'dakota',
 'dana',
 'dash',
 'declares',
 'deer',
 'dense',
 'designers',
 'devi',
 'devices',
 'diego',
 'differently',
 'dim',
 'dimensions',
 'dismay',
 'documentation',
 'donkey',
 'donna',
 'dumped',
 'eclectic',
 'educate',
 'ein',
 'eleanor',
 'eli',
 'embarrassment',
 'emptiness',
 'ethics',
 'eugene',
 'examined',
 'exception',
 'expense',
 'fait',
 'falsely',
 'fare',
 'fascist',
 'fault',
 'fearing',
 'feisty',
 'file',
 'fingers',
 'flights',
 'floating',
 'foil',
 'f

In [63]:
# groups_filter.iloc[:, 0:groups_filter.shape[1]-2]
# groups_filter = groups_filter.iloc[:, 0:groups_filter.shape[1]-2]
# n_filter = (groups_filter > 0).sum()
# n_filter

Genre 1
Comedy         15700
Documentary    19137
Drama          19332
dtype: int64

In [64]:
# groups_filter = groups_filter.T

# def sorted(s, num):
#     tmp = s.sort_values(ascending=False)[:num]  # earlier s.order(..)
# #     tmp.index = range(num)
#     return tmp

# groups_filter = groups_filter[word_list]
# groups_filter
# sorted_words = groups_filter.T.apply(lambda x: sorted(x, 200))
# sorted_words

Genre 1,Comedy,Documentary,Drama
abbey,,15.0,
abel,,,13.0
abraham,,,14.0
accent,9.0,,
acquainted,,,11.0
acted,,,12.0
adjust,,,12.0
adultery,,,12.0
adversity,,13.0,
advises,,,11.0


In [9]:
# # Each row now has 4 intersections with all 4 genres
# df_array = df_test[list(sorted_words.index)].values
# tfm = [row * (~sorted_words.isnull().T) for row in df_array]

In [65]:
# comedy_base = ~sorted_words['Comedy'][~sorted_words.isnull()['Comedy'].T].isnull()
# # horror_base = ~sorted_words['Horror'][~sorted_words.isnull()['Horror'].T].isnull()
# documentary_base = ~sorted_words['Documentary'][~sorted_words.isnull()['Documentary'].T].isnull()
# drama_base = ~sorted_words['Drama'][~sorted_words.isnull()['Drama'].T].isnull()

In [128]:
comedy_test = df_test[comedy_top_words.index.values]
# horror_test = df_test[horror_base.index.values]
documentary_test = df_test[documentary_top_words.index.values]
drama_test = df_test[drama_top_words.index.values]

In [132]:
from sklearn.metrics.pairwise import cosine_similarity
def calculate_dist_matrix(x):
    dist_matrix = np.arccos(np.round(cosine_similarity(x), 5))
    aux_zeros = np.zeros(dist_matrix.shape[0] - 1)
    np.fill_diagonal(dist_matrix[1:], aux_zeros)
    np.fill_diagonal(dist_matrix[:,1:], aux_zeros)
    np.fill_diagonal(dist_matrix, 0)
    return dist_matrix

In [133]:
comedy_test_dist = pd.DataFrame(comedy_test.groupby(comedy_test.index).apply(lambda x: calculate_dist_matrix(x)))
documentary_test_dist = pd.DataFrame(documentary_test.groupby(documentary_test.index).apply(lambda x: calculate_dist_matrix(x)))
drama_test_dist = pd.DataFrame(drama_test.groupby(drama_test.index).apply(lambda x: calculate_dist_matrix(x)))

In [134]:
comedy_test_dist.loc[2017][0]

array([[0.        , 0.        , 0.78539361, 1.57079633],
       [0.        , 0.        , 0.        , 1.57079633],
       [0.78539361, 0.        , 0.        , 0.        ],
       [1.57079633, 1.57079633, 0.        , 0.        ]])

In [135]:
comedy_test_dist.loc[2020][0]

array([[0.        , 0.        , 1.57079633, 1.57079633, 1.57079633],
       [0.        , 0.        , 0.        , 0.78539361, 1.57079633],
       [1.57079633, 0.        , 0.        , 0.        , 1.57079633],
       [1.57079633, 0.78539361, 0.        , 0.        , 0.        ],
       [1.57079633, 1.57079633, 1.57079633, 0.        , 0.        ]])

### VRC

In [136]:
import scipy as sp
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
from rpy2.robjects import r
# import pandas.rpy.common as com

# import R's "base" package
base = importr('base')

# import R's "utils" package
utils = importr('utils')

# import rpy2's package module
import rpy2.robjects.packages as rpackages

# import R's utility package
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

# R package names
packnames = ('TDA')

# R vector of strings
from rpy2.robjects.vectors import StrVector

utils.install_packages(StrVector('TDA'))
r('library(TDA)')

0,1,2,3,4,5,6
'TDA','tools','stats',...,'datasets','methods','base'


In [137]:
import rpy2
from rpy2.robjects import pandas2ri # install any dependency package if you get error like "module not found"
from rpy2.robjects.conversion import localconverter
from rpy2.robjects import globalenv

def r_convert(x):
    pandas2ri.activate()
    
    aux_tmp = pd.DataFrame(pd.Series(x.values.tolist()).loc[0][0])
    
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_df = ro.conversion.py2rpy(aux_tmp)
        
    globalenv['r_df'] = r_df
    r('Diag <- ripsDiag(X = r_df, 1, max(r_df), library = "Dionysus", dist = "arbitrary", printProgress = FALSE)')
    return(r('Diag$diagram'))

In [138]:
comedy_rips = comedy_test_dist.groupby(comedy_test_dist.index).apply(lambda x: r_convert(x))
# horror_rips = horror_test.groupby(horror_test.index).apply(lambda x: r_convert(x))
documentary_rips = documentary_test_dist.groupby(documentary_test_dist.index).apply(lambda x: r_convert(x))
drama_rips = drama_test_dist.groupby(drama_test_dist.index).apply(lambda x: r_convert(x))

In [148]:
comedy_rips.apply(lambda x: np.any(x[:, 0] == 1)).values.mean()
documentary_rips.apply(lambda x: np.any(x[:, 0] == 1)).values.mean()
drama_rips.apply(lambda x: np.any(x[:, 0] == 1)).values.mean()

0.1153169014084507

In [152]:
movies.loc[comedy_rips[comedy_rips.apply(lambda x: np.any(x[:, 0] == 1)).values].index.values]['Genre 1'].value_counts()

Drama          746
Comedy         626
Documentary    463
Name: Genre 1, dtype: int64

In [139]:
(movies.loc[comedy_rips[comedy_rips.apply(lambda x: np.any(x[:, 0] == 1)).values].index.values]['Genre 1'] == 'Comedy').mean()

0.3411444141689373

In [140]:
(movies.loc[documentary_rips[documentary_rips.apply(lambda x: np.any(x[:, 0] == 1)).values].index.values]['Genre 1'] == 'Documentary').mean()

0.6551528878822197

In [141]:
(movies.loc[drama_rips[drama_rips.apply(lambda x: np.any(x[:, 0] == 1)).values].index.values]['Genre 1'] == 'Drama').mean()

0.48909487459105777

In [272]:
r('dev.new()') # optional: create a new figure
r('plot(Diag$diagram, barcode=TRUE)')
r('dev.off()')

R[write to console]: dev.new(): using pdf(file="Rplots1.pdf")



array([1], dtype=int32)

In [None]:

df = df.iloc[:, (groups_filter.apply(lambda x: sum(x != 0)) == 1).values]

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(df)
tfidf = pd.DataFrame(tfidf.toarray())
tfidf.columns = df.columns.difference(['Genre 1'])
tfidf

tfidf = tfidf.reset_index()[tfidf.reset_index().columns.difference(['index'])]
first_class = pd.concat([dtm, tfidf], axis=1)
first_class