### Import & Data Preprocessing

In [811]:
import pandas as pd
import os
import numpy as np
import math
from langdetect import detect


path = '../tda'
os.chdir(path)

movies = pd.read_csv('movies_genres.csv', delimiter='\t')
movies['plot_lang'] = movies.apply(lambda row: detect(row['plot']), axis=1)
movies = movies[movies.plot_lang.isin(['en'])]

# Only 1 genre
movies = movies[movies.iloc[:, 2:30].sum(axis = 1) == 1]
genres = movies.iloc[:, 2:30].replace(1, pd.Series(movies.columns, movies.columns)).replace(0, "").max(axis = 1)
movies = pd.concat([movies[['title', 'plot']], genres], axis = 1).rename(columns={0:'genre'})

# Get rid of strange dots, replace question & exclamation marks
movies.loc['plot'] = movies['plot'].str.replace('\\.\\.\\.', '\\.')
movies.loc['plot'] = movies['plot'].str.replace('\\.\\.', '\\.')
movies.loc['plot'] = movies['plot'].str.replace('Mrs\\.', 'Mrs')
movies.loc['plot'] = movies['plot'].str.replace('Mr\\.', 'Mr')
movies.loc['plot'] = movies['plot'].str.replace('\\?', '\\.')
movies.loc['plot'] = movies['plot'].str.replace('\\!', '\\.')

# More than 3 sentences
movies = movies.iloc[((~movies['plot'].str.split('\\. ',expand=True).isnull()).apply(sum, axis = 1) > 3).values]

# Genre is Comedy, Documentary or Drama
movies = movies[movies['genre'].apply(lambda x: x in ['Comedy', 'Documentary', 'Drama'])]

# Choosing base set
movies = pd.concat([movies[movies['genre'] == 'Comedy'].sample(2800, random_state = 1),
movies[movies['genre'] == 'Documentary'].sample(2800, random_state = 2),
movies[movies['genre'] == 'Drama'].sample(2800, random_state = 3)])

# Choosing test set
movies_test = pd.concat([movies[movies['genre'] == 'Comedy'].sample(200, random_state = 4),
movies[movies['genre'] == 'Documentary'].sample(200, random_state = 5),
movies[movies['genre'] == 'Drama'].sample(200, random_state = 6)])

# Splitting test by sentence
sentences = pd.DataFrame(movies_test['plot'].str.split('\\. ',expand=True).unstack()).reset_index().sort_values(['level_1', 'level_0'])
sentences = sentences[sentences[0].apply(lambda x: x is not None)]
sentences = sentences.set_index('level_1').drop('level_0', axis = 1).rename(columns = {0:'plot'})

movies = movies.loc[np.setdiff1d(movies.index.values, movies_test.index.values)]

# Genres
genres = movies['genre']

### NLP Processing

In [897]:
stop = stopwords.words('english')

In [904]:
stop = pd.Series(stop).apply(lambda x: x.translate(str.maketrans({key: None for key in string.punctuation})))

0            i
1           me
2           my
3       myself
4           we
        ...   
174     werent
175        won
176       wont
177     wouldn
178    wouldnt
Length: 179, dtype: object

In [905]:
stop = stop.apply(lambda x: x.lower())

AttributeError: 'list' object has no attribute 'apply'

In [901]:
stop.translate(str.maketrans({key: None for key in string.punctuation}))
# docs = docs.apply(lambda x: x.lower())

AttributeError: 'list' object has no attribute 'translate'

In [605]:
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

def mj_dtm(description):
    
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)
    
    stop = stopwords.words('english')

    docs = description

    docs = docs.apply(lambda x: x.translate(str.maketrans({key: None for key in string.punctuation})))
    docs = docs.apply(lambda x: x.lower())

    docs = docs.apply(lambda x: x.split())
    
    wordnet_lemmatizer = WordNetLemmatizer()
    docs = docs.apply(lambda x: [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x])
    
#     lancaster = LancasterStemmer()
#     docs = docs.apply(lambda x: [lancaster.stem(word) for word in x])

    docs = docs.apply(lambda x: [word for word in x if word not in stop])
    
    docs = docs.apply(lambda x: ' '.join(x))

    vec = CountVectorizer()

    X = vec.fit_transform(docs)
    df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names(), index = docs.index)

    # Words occurred in more than 1 movie
#     df = df.iloc[:,((df>0).apply(sum) > 1).values]

    df = df.iloc[:, pd.Series(df.columns).apply(lambda x: re.match("^[0-9]", x) is None).values]
    df = df.iloc[:, pd.Series(df.columns).apply(lambda x: re.match("^[a-z]", x) is not None).values]
    
    return df

In [260]:
df = mj_dtm(movies['Description'])

In [299]:
top_movies = pd.concat([genres, df.sum(axis = 1)], axis = 1).groupby('Genre 1')\
.apply(lambda x: x.sort_values(ascending = False, by = 0)[:1600]).index.get_level_values(None).values

In [308]:
movies = movies.loc[top_movies]

In [319]:
df = mj_dtm(movies['Description'])

### Choosing genre top_words

In [553]:
groups_filter = pd.concat([movies['Genre 1'], df], axis=1)
sum_groups = groups_filter.groupby('Genre 1').sum()
sum_overall = sum_groups.sum()
# categorical_cross_entropy = (1 - (sum_groups * np.log(sum_groups/sum_overall))/(sum_overall*np.log(1/3))).fillna(0)
# categorical_cross_entropy
categorical_shares = sum_groups/sum_overall

In [555]:
(groups_filter.set_index('Genre 1').sum(axis = 1) > 40).groupby('Genre 1').sum()

Genre 1
Comedy         1600.0
Documentary    1600.0
Drama          1600.0
dtype: float64

In [566]:
comedy_top_words = categorical_shares.loc['Comedy'][sum_groups.loc['Comedy'].sort_values(ascending = False)[:500]\
                                                    .index.values].sort_values(ascending = False)[:200]

In [567]:
comedy_top_words

stooge      1.000000
det         0.983607
que         0.968254
standup     0.943182
til         0.936170
              ...   
teach       0.378947
learns      0.377049
star        0.376884
another     0.376068
everyone    0.375000
Name: Comedy, Length: 200, dtype: float64

In [569]:
documentary_top_words = categorical_shares.loc['Documentary'][sum_groups.loc['Documentary'].sort_values(ascending = False)[:500]\
                                                              .index.values].sort_values(ascending = False)[:200]

In [570]:
documentary_top_words

documentary    0.944538
examines       0.923077
wrestling      0.893939
document       0.875000
insight        0.868421
                 ...   
issue          0.504274
war            0.503371
understand     0.500000
building       0.500000
york           0.497738
Name: Documentary, Length: 200, dtype: float64

In [571]:
drama_top_words = categorical_shares.loc['Drama'][sum_groups.loc['Drama'].sort_values(ascending = False)[:500]\
                                                  .index.values].sort_values(ascending = False)[:200]

In [572]:
drama_top_words

anna       0.752941
drama      0.681564
tragedy    0.666667
unable     0.666667
dy         0.663265
             ...   
later      0.412121
drug       0.410811
act        0.410811
door       0.410000
name       0.409922
Name: Drama, Length: 200, dtype: float64

### Intersection

In [622]:
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))

In [623]:
intersection(mj_dtm(sentences[:5]['overview']).columns.values, comedy_top_words.index.values)

['boyfriend',
 'manage',
 'seem',
 'let',
 'win',
 'trip',
 'want',
 'however',
 'shes']

In [624]:
def intersecter(x, base):
    res = mj_dtm(x)
    res = res[intersection(res.columns.values, base.index.values)]
    return res

### Cosine similarity

In [728]:
from sklearn.metrics.pairwise import cosine_similarity
def calculate_dist_matrix(x):
    
    if len(x.columns) == 0:
        return np.array([[0]])
    else:
        dist_matrix = np.arccos(np.round(cosine_similarity(x), 5))
        aux_zeros = np.zeros(dist_matrix.shape[0] - 1)
        np.fill_diagonal(dist_matrix[1:], aux_zeros)
        np.fill_diagonal(dist_matrix[:,1:], aux_zeros)
        np.fill_diagonal(dist_matrix, 0)
        return dist_matrix

### VRC

In [633]:
import scipy as sp
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
from rpy2.robjects import r
# import pandas.rpy.common as com

# import R's "base" package
base = importr('base')

# import R's "utils" package
utils = importr('utils')

# import rpy2's package module
import rpy2.robjects.packages as rpackages

# import R's utility package
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

# R package names
packnames = ('TDA')

# R vector of strings
from rpy2.robjects.vectors import StrVector

utils.install_packages(StrVector('TDA'))
r('library(TDA)')

R[write to console]:  failed to download mirrors file (cannot download all files); using local file '/Library/Frameworks/R.framework/Resources/doc/CRAN_mirrors.csv'



array(['TDA', 'tools', 'stats', 'graphics', 'grDevices', 'utils',
       'datasets', 'methods', 'base'], dtype='<U9')

In [721]:
import rpy2
from rpy2.robjects import pandas2ri # install any dependency package if you get error like "module not found"
from rpy2.robjects.conversion import localconverter
from rpy2.robjects import globalenv

def r_convert(x):
    pandas2ri.activate()
    
    aux_tmp = pd.Series(x.tolist()).loc[0][0]
    
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_df = ro.conversion.py2rpy(aux_tmp)
        
    globalenv['r_df'] = r_df
    r('Diag <- ripsDiag(X = r_df, 1, max(r_df), library = "Dionysus", dist = "arbitrary", printProgress = FALSE)')
    return(r('Diag$diagram'))

In [752]:
r_convert(calculate_dist_matrix(intersecter(sentences.loc[2024]['overview'], comedy_top_words)))

array([[0., 0., 0.]])

In [733]:
comedy_rips = sentences.groupby('level_1')\
.apply(lambda x: r_convert(calculate_dist_matrix(intersecter(x['overview'], comedy_top_words))))
comedy_rips

level_1
2024     [[0.0, 0.0, 0.0]]
2030     [[0.0, 0.0, 0.0]]
2057     [[0.0, 0.0, 0.0]]
2062     [[0.0, 0.0, 0.0]]
2096     [[0.0, 0.0, 0.0]]
               ...        
17871    [[0.0, 0.0, 0.0]]
17891    [[0.0, 0.0, 0.0]]
17892    [[0.0, 0.0, 0.0]]
18041    [[0.0, 0.0, 0.0]]
18064    [[0.0, 0.0, 0.0]]
Length: 600, dtype: object

In [734]:
documentary_rips = sentences.groupby('level_1')\
.apply(lambda x: r_convert(calculate_dist_matrix(intersecter(x['overview'], documentary_top_words))))
documentary_rips

level_1
2024     [[0.0, 0.0, 0.0]]
2030     [[0.0, 0.0, 0.0]]
2057     [[0.0, 0.0, 0.0]]
2062     [[0.0, 0.0, 0.0]]
2096     [[0.0, 0.0, 0.0]]
               ...        
17871    [[0.0, 0.0, 0.0]]
17891    [[0.0, 0.0, 0.0]]
17892    [[0.0, 0.0, 0.0]]
18041    [[0.0, 0.0, 0.0]]
18064    [[0.0, 0.0, 0.0]]
Length: 600, dtype: object

In [735]:
drama_rips = sentences.groupby('level_1')\
.apply(lambda x: r_convert(calculate_dist_matrix(intersecter(x['overview'], drama_top_words))))
drama_rips

level_1
2024     [[0.0, 0.0, 0.0]]
2030     [[0.0, 0.0, 0.0]]
2057     [[0.0, 0.0, 0.0]]
2062     [[0.0, 0.0, 0.0]]
2096     [[0.0, 0.0, 0.0]]
               ...        
17871    [[0.0, 0.0, 0.0]]
17891    [[0.0, 0.0, 0.0]]
17892    [[0.0, 0.0, 0.0]]
18041    [[0.0, 0.0, 0.0]]
18064    [[0.0, 0.0, 0.0]]
Length: 600, dtype: object

In [736]:
print(comedy_rips.apply(lambda x: np.any(x[:, 0] == 1)).values.mean(),
documentary_rips.apply(lambda x: np.any(x[:, 0] == 1)).values.mean(),
drama_rips.apply(lambda x: np.any(x[:, 0] == 1)).values.mean())

0.0 0.0 0.0


In [139]:
(movies.loc[comedy_rips[comedy_rips.apply(lambda x: np.any(x[:, 0] == 1)).values].index.values]['Genre 1'] == 'Comedy').mean()

0.3411444141689373

In [140]:
(movies.loc[documentary_rips[documentary_rips.apply(lambda x: np.any(x[:, 0] == 1)).values].index.values]['Genre 1'] == 'Documentary').mean()

0.6551528878822197

In [141]:
(movies.loc[drama_rips[drama_rips.apply(lambda x: np.any(x[:, 0] == 1)).values].index.values]['Genre 1'] == 'Drama').mean()

0.48909487459105777

In [272]:
r('dev.new()') # optional: create a new figure
r('plot(Diag$diagram, barcode=TRUE)')
r('dev.off()')

R[write to console]: dev.new(): using pdf(file="Rplots1.pdf")



array([1], dtype=int32)