In [98]:
import re
import string

import nltk
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

In [2]:
wiki_data_all = pd.read_csv('./wiki_movie_plots_deduped.csv')
wiki_data_all = wiki_data_all[~wiki_data_all['Genre'].str.contains('unknown')]
wiki_data_all

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...
...,...,...,...,...,...,...,...,...
34877,2013,Particle (film),Turkish,Erdem Tepegöz,"Jale Arıkan, Rüçhan Caliskur, Özay Fecht, Remz...",drama film,https://en.wikipedia.org/wiki/Particle_(film),"Zeynep lost her job at weaving factory, and he..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [54]:
class WikiMovieModel:
    
    def __init__(self):
        print()
        
        
def filter_genre(genre): 
    return [g.split('(')[0].strip().lower() for g in re.split(',|/', genre) if ')' not in g]
    
    
def filter_plot(plot):
    plot = ' '.join(re.split(r'\[\d+\]', plot))
    return ' '.join([s.strip(string.punctuation) for s in plot.encode('ascii', 'ignore').decode().split()])

In [117]:
wiki_data_all['genre_split'] = wiki_data_all['Genre'].apply(filter_genre)

wiki_data_exploded = wiki_data_all.explode('genre_split')
top_30_genre = list(wiki_data_exploded['genre_split'].value_counts()[:3].keys())

wiki_data_reduced = wiki_data_exploded[wiki_data_exploded['genre_split'].isin(top_30_genre)].reset_index(drop=True)
wiki_data_reduced['plot_filtered'] = wiki_data_reduced['Plot'].apply(filter_plot)


wiki_data_reduced

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,genre_split,plot_filtered
0,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,comedy,The film is about a family who move to the sub...
1,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...,comedy,Before heading out to a baseball game at a nea...
2,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...,comedy,The plot is that of a black woman going to the...
3,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...,drama,On a beautiful summer day a father and mother ...
4,1908,The Black Viper,American,D. W. Griffith,D. W. Griffith,drama,https://en.wikipedia.org/wiki/The_Black_Viper,A thug accosts a girl as she leaves her workpl...,drama,A thug accosts a girl as she leaves her workpl...
...,...,...,...,...,...,...,...,...,...,...
15811,2011,White as Snow,Turkish,Selim Güneş,"Hakan Korkmaz, Sinem İslamoğlu & Gürkan Piri O...",drama,https://en.wikipedia.org/wiki/White_as_Snow_(f...,Hasan is a twelve-year-old boy living with his...,drama,Hasan is a twelve-year-old boy living with his...
15812,2011,Once Upon a Time in Anatolia,Turkish,Nuri Bilge Ceylan,"Yılmaz Erdoğan, Taner Birsel & Ufuk Karaali",drama,https://en.wikipedia.org/wiki/Once_Upon_a_Time...,"Through the night, three cars carry a small gr...",drama,Through the night three cars carry a small gro...
15813,2013,Selam,Turkish,Levent Demirkale,"Bucin Abdullah, Selma Alispahic, Tina Cvitanov...",drama,https://en.wikipedia.org/wiki/Selam_(film),The film opens with a Senegalese boy named Kha...,drama,The film opens with a Senegalese boy named Kha...
15814,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the...",comedy,Two musicians Salih and Grkan described the ad...


In [118]:
X = wiki_data_reduced['plot_filtered']
y = wiki_data_reduced['genre_split']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=123)
X_train, y_train

(13808    The story of a man called Govinda who falls in...
 11364    Hang Yuan a man who's been in love with a frie...
 1244     Captain John Winslow Arthur Byron is notified ...
 2786     William Thompson William Lundigan is a ministe...
 6530     As children Marty and Carol Lakewood fraternal...
                                ...                        
 12252    Bharathi plays a deserted wife to Vinod Mehra ...
 1346     Chivo a singer who works in a movie theater pr...
 11646    According to the legend before the God of Gamb...
 15725    The plot concerns Sam-ryong a deaf servant who...
 3582     Parker Ballantine Bob Hope is a theatrical cri...
 Name: plot_filtered, Length: 11071, dtype: object, 13808     drama
 11364    comedy
 1244      drama
 2786      drama
 6530      drama
           ...  
 12252    comedy
 1346     comedy
 11646     drama
 15725     drama
 3582     comedy
 Name: genre_split, Length: 11071, dtype: object)

In [119]:
count_vectorizer = CountVectorizer(ngram_range=(1,1), max_features=10000)

X_train_transformed = count_vectorizer.fit_transform(X_train)
X_test_transformed = count_vectorizer.transform(X_test)

X_test_transformed

<4745x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 767674 stored elements in Compressed Sparse Row format>

In [120]:
log_reg = LogisticRegression(max_iter=1000,n_jobs=-1).fit(X_train_transformed, y_train)
pred = log_reg.predict(X_test_transformed)
sum(pred == y_test)/len(y_test)

0.6166491043203371