In this notebook we'll try to train a rudimentary NLP solution against our target variable.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the prepared shows DataFrame used throughout this notebook.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source. The file name suggests NA-filtered, pre-2017 data —
# TODO confirm against the notebook that produced it.
shows = pd.read_pickle("no_na_pre2017_v4.pkl")

In [2]:
# List the available columns to locate the free-text fields and the target.
shows.columns

Index(['genre', 'link', 'network', 'status', 'tagline', 'title', 'years',
       'start_year', 'end_year', 'synopsis', 'primary_genre',
       'secondary_genre', 'Comedy', 'Drama', 'Game Show', 'Reality', 'Sci-fi',
       'Talk', 'Crime', 'Action', 'Fantasy', 'Animated', 'Horror', 'Legal',
       'Medical', 'tvmaze_rating', 'tvmaze_ep_day', 'imdb_actors',
       'imdb_awards', 'imdb_country', 'imdb_director', 'imdb_genre',
       'imdb_language', 'imdb_plot', 'imdb_rated', 'imdb_website',
       'imdb_writer', 'imdb_year', 'imdb_rating', 'imdb_votes',
       'imdb_totalseasons', 'prem_date', 'quarter', 'month', 'yr_quarter',
       'yr_mo', 'yrs_run', 'one_season_cancel', 'two_season_cancel',
       'three_season_cancel', 'broadcast_flag', 'streaming_flag',
       'premium_cable_flag', 'basic_cable_flag', 'network_type',
       'pct_one_canc', 'pct_two_canc', 'risky_1season', 'risky_2season',
       'risky_3season', 'runtime', 'country', 'rounded_rating', 'rating_cat',
       'log_imdb

In [3]:
# Preview the three free-text columns that could feed the NLP features.
# Only a bounded sample is shown so the notebook does not render the whole frame.
shows[['tagline', 'synopsis', 'imdb_plot']].head(10)

Unnamed: 0,tagline,synopsis,imdb_plot
0,"A sitcom based on the Twitter feed ""S*** My Da...",Ed is an opinionated and divorced 72-year-old ...,This show is about Ed Goodson a very old fashi...
2,A game show competition where contestants are ...,"In 101 Ways to Leave a Game Show, contestants ...",People must answere questions correctly and if...
3,A drama following a man sent back in time to p...,"By the year 2043, a deadly virus has wiped out...",Follows the journey of a time traveler from th...
5,A reality series documenting the hardships of ...,"""16 and Pregnant"" follows stories of various p...",A documentary series focusing on the controver...
6,A comedy following a dysfunctional family livi...,The Gilchrists are a typical American family w...,President Gilcrest may be foremost the US head...
7,A Canadian comedy series about a modern-day Ro...,Tom and Jessie are 18-year-old next-door neigh...,Two couples with opposing points of view reluc...
8,A reality series following the Duggar family.,Cameras in this reality series follow Jim Bob ...,"The lives of the Duggars, a Christian homescho..."
9,A comedy following the lives of two waitresses...,Max is a twenty-something girl who comes from ...,"Sassy, streetwise Max works two jobs just to g..."
10,A dramatic thriller where the action takes pla...,Jack Bauer works for the Counter Terrorist Uni...,"In this concept drama, each season takes place..."
12,An action drama following Jack Bauer as he att...,"The award-winning, high-octane drama returns i...","Four years after the events of Day 8, Jack Bau..."


Note: I tried the vectorizer below both with IDF weighting and without it (i.e. a plain term-frequency bag of words); dropping IDF gave no better results.

In [4]:
# TF-IDF over lower-cased text with English stop words removed.
# Terms appearing in more than 75% or fewer than 2% of documents are dropped.
# (Passing use_idf=False here gives the non-IDF variant mentioned in the note.)
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_df=0.75,
    min_df=0.02,
)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [6]:
# L1-penalised logistic regression. The solver is named explicitly because
# scikit-learn's default solver (lbfgs since 0.22) does not support the L1
# penalty and raises at fit time; liblinear does support it.
lrc = LogisticRegression(penalty='l1', solver='liblinear')
# 200-tree random forest; random_state pins the bootstrap and feature sampling
# so the reported accuracies are reproducible from run to run.
rfc = RandomForestClassifier(n_estimators=200, random_state=0)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the shows: the synopsis text is the input and
# three_season_cancel is the target. The seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    shows['synopsis'].astype('O'),
    shows['three_season_cancel'],
    test_size=0.25,
    random_state=0,
)

In [8]:
# Fit the vectorizer on the training synopses only, then reuse that fitted
# vectorizer to transform the held-out synopses (no refit on test data).
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

# Latent semantic analysis: reduce the TF-IDF matrix to 30 components, then
# rescale every document vector to unit length.
# TruncatedSVD uses a randomized solver by default, so random_state is fixed
# to make the components (and everything trained on them) reproducible.
svd = TruncatedSVD(n_components=30, random_state=0)
lsa = make_pipeline(svd, Normalizer(copy=False))
X_train_lsa = lsa.fit_transform(X_train_tfidf)

# Project the held-out documents with the pipeline fitted on the training set.
X_test_lsa = lsa.transform(X_test_tfidf)

In [10]:
# Train the logistic regression on the LSA features and predict the held-out
# labels in one chained call (fit returns the fitted estimator).
Y_pred = lrc.fit(X_train_lsa, Y_train).predict(X_test_lsa)

In [11]:
from sklearn import metrics
# Held-out accuracy of the logistic regression. scikit-learn metrics take
# (y_true, y_pred) in that order; accuracy is symmetric, but keeping the
# documented order avoids silent errors if this is swapped for an asymmetric
# metric such as precision or recall.
metrics.accuracy_score(Y_test, Y_pred)

0.5288753799392097

In [12]:
# Peek at the first few LSA document vectors (one row per training document,
# one column per SVD component) instead of rendering the whole matrix.
pd.DataFrame(X_train_lsa).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.477431,0.230467,0.261779,0.417173,0.156003,-0.022026,-0.259596,0.304818,-0.111523,0.090395,...,0.197055,0.082400,0.023008,0.128614,0.012188,0.035206,0.169999,0.077046,-0.014727,0.032384
1,0.397894,-0.201220,-0.010235,-0.165234,-0.051893,0.441116,0.006216,0.141009,0.413867,0.007205,...,0.127798,-0.056793,0.076455,-0.040105,-0.152250,0.135027,-0.212617,-0.059637,-0.256194,-0.129074
2,0.261855,-0.025272,-0.194440,-0.266864,0.065895,-0.120182,0.066071,-0.193517,-0.169133,-0.017188,...,0.193954,0.014744,0.014941,-0.037714,0.249405,0.231791,-0.175954,0.281558,-0.061297,-0.249190
3,0.247501,0.242594,0.000247,0.158558,0.160888,0.047644,-0.395994,0.522177,-0.288441,0.141077,...,-0.071383,0.020477,-0.151894,0.140894,0.037038,0.105742,0.133882,-0.068747,0.023874,0.090301
4,0.465528,-0.232374,-0.171626,0.337815,-0.208965,-0.155715,0.206331,0.111640,-0.030593,-0.004908,...,-0.246600,-0.214463,0.020653,-0.135679,0.030600,-0.117651,0.006786,-0.157777,0.018181,-0.169010
5,0.435468,0.392524,0.305397,0.146445,-0.141861,-0.051625,-0.202569,0.280427,-0.194866,0.051275,...,-0.114950,0.101283,-0.116467,0.044998,0.040255,-0.099616,-0.091617,-0.025964,0.284473,-0.066484
6,0.106878,0.209106,-0.021528,0.063457,-0.264332,0.135358,0.067716,-0.238581,-0.071246,-0.169153,...,0.379720,0.112932,-0.214685,0.301221,0.145963,-0.180470,0.181530,0.101925,-0.103539,-0.164381
7,0.255176,-0.067529,-0.173177,-0.218114,-0.008299,-0.383563,0.192106,0.157645,0.003217,0.001279,...,0.318708,-0.084303,-0.134404,0.058153,-0.134261,0.102145,0.245670,0.123327,-0.119307,0.054503
8,0.309077,0.166678,0.007219,0.167826,0.656601,0.130271,0.315498,-0.130993,0.384414,-0.046433,...,0.002787,0.189156,-0.043600,0.107958,-0.089066,0.070524,-0.028558,-0.124229,-0.020307,-0.069901
9,0.117795,0.093878,-0.178979,-0.203388,-0.011435,-0.394292,0.114439,0.034111,0.235060,-0.026702,...,-0.162825,-0.021836,0.141834,-0.146133,-0.019166,-0.123209,0.173526,-0.065510,-0.112255,0.177890


In [13]:
# Non-text features: one-hot encoded network, the genre columns, and the
# premiere quarter/month columns.
genre_and_timing_cols = [
    'Comedy', 'Drama', 'Game Show', 'Reality', 'Sci-fi',
    'Talk', 'Crime', 'Action', 'Fantasy', 'Animated', 'Horror', 'Legal',
    'Medical', 'quarter', 'month',
]
network_dummies = pd.get_dummies(shows['network'].astype('O'))
Other_X = pd.concat([network_dummies, shows[genre_and_timing_cols]], axis=1)

In [14]:
# Split the non-text features with the same test_size and random_state as the
# synopsis split. NOTE(review): row alignment with X_train / X_test relies on
# both splits using identical parameters on frames with the same number of rows —
# splitting everything in a single train_test_split call would be more robust.
X_train_other, X_test_other = train_test_split(
    Other_X, test_size=0.25, random_state=0
)

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Compress the non-text features: rescale, then keep 15 principal components.
# NOTE(review): Normalizer rescales each row (sample) to unit norm; it does not
# standardise columns. StandardScaler is imported in this cell but never used —
# confirm which of the two was intended before relying on these components.
scaler = Normalizer()

# Fit the scaler and the PCA on the training rows only.
X_scaled = scaler.fit_transform(X_train_other)
pca = PCA(n_components=15)
X_pca_other_train = pd.DataFrame(pca.fit_transform(X_scaled))

# Apply the already-fitted scaler and PCA to the held-out rows.
X_scaled_test = scaler.transform(X_test_other)
X_pca_other_test = pd.DataFrame(pca.transform(X_scaled_test))

In [16]:
# Show the raw LSA training features (a NumPy array) before they are combined
# with the PCA features.
X_train_lsa

array([[ 0.4774315 ,  0.23046727,  0.26177853, ...,  0.07704648,
        -0.01472676,  0.03238405],
       [ 0.39789382, -0.20122016, -0.01023511, ..., -0.0596373 ,
        -0.25619434, -0.1290736 ],
       [ 0.26185513, -0.02527158, -0.19444046, ...,  0.28155795,
        -0.06129707, -0.24918988],
       ...,
       [ 0.38130252, -0.32333653, -0.14467692, ...,  0.30082612,
         0.02215248,  0.16826893],
       [ 0.30057057,  0.00741446,  0.01668063, ...,  0.08999452,
        -0.32506789,  0.24792997],
       [ 0.18980672,  0.04298615, -0.20908828, ..., -0.04899454,
        -0.05096206, -0.02995797]])

In [17]:
# Show the 15-component PCA projection of the non-text training features.
X_pca_other_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-0.055521,-0.087617,-0.152292,-0.034845,0.002393,-0.018091,0.009022,-0.000954,-0.018359,-0.051444,-0.027569,0.136370,0.030776,-0.006903,0.006541
1,0.335287,-0.295193,0.367070,-0.002467,-0.076791,-0.053212,0.005507,-0.011291,-0.044183,-0.063177,-0.004166,-0.021513,-0.005890,-0.005808,0.046882
2,-0.078045,0.069481,-0.006145,-0.013081,-0.007646,-0.028393,-0.011602,0.026691,-0.001095,-0.009786,0.013770,-0.019894,0.072701,-0.004800,-0.031813
3,0.378574,-0.287494,-0.340968,-0.073796,-0.028650,-0.020145,0.015351,0.010098,-0.005880,-0.006052,0.002437,-0.057667,-0.006559,-0.000534,0.014415
4,0.486344,0.290617,0.041008,-0.115326,-0.231610,0.403166,-0.018073,0.019324,-0.078398,0.053035,-0.048923,-0.009331,0.045423,-0.127022,0.102959
5,-0.066121,-0.059348,-0.105613,-0.025236,-0.000112,-0.019978,-0.009389,0.009255,-0.001287,0.005583,0.004437,-0.024423,-0.003555,-0.000704,0.010844
6,-0.144071,0.009528,-0.014283,-0.013152,0.001212,-0.021355,-0.026919,0.070100,0.033633,0.051435,-0.004717,0.010355,0.023775,0.025726,0.014630
7,0.493932,0.315697,0.029243,-0.018958,0.024880,-0.039131,0.257309,0.137527,0.209669,-0.176755,0.034995,-0.023066,-0.070955,-0.011851,0.101181
8,-0.027806,-0.131710,-0.224775,-0.036904,0.012074,-0.000951,0.029978,-0.027852,-0.016727,-0.048598,-0.009409,-0.081518,-0.061434,0.022788,-0.125401
9,0.488420,0.329897,0.018178,-0.067592,-0.124458,-0.143384,-0.207058,-0.221619,0.225976,0.009613,0.024083,0.034156,-0.017196,-0.152128,-0.071183


In [18]:
# Combine the PCA (non-text) and LSA (text) features side by side.
# Both inputs carry integer column labels starting at 0, so a plain concat
# yields duplicate labels; prefixing keeps every column uniquely addressable.
# Rows line up because both frames use a default 0..n-1 index.
full_xtrain = pd.concat(
    [X_pca_other_train.add_prefix('pca_'),
     pd.DataFrame(X_train_lsa).add_prefix('lsa_')],
    axis=1,
)
full_xtest = pd.concat(
    [X_pca_other_test.add_prefix('pca_'),
     pd.DataFrame(X_test_lsa).add_prefix('lsa_')],
    axis=1,
)

In [19]:
# Sweep the LSA dimensionality from 1 to 100 components and record the random
# forest's held-out accuracy for each setting.
# Sweep-local names are used so the loop does not overwrite the `svd`, `lsa`,
# `X_train_lsa`, `X_test_lsa` and `Y_pred` objects built in earlier cells.
# NOTE(review): comparing settings on the test split is optimistic; the
# imported cross_val_score on the training split would be a fairer comparison.
lsa_sweep_accuracy = {}
for n_components in range(1, 101):
    # random_state pins TruncatedSVD's randomized solver so the sweep is repeatable.
    sweep_lsa = make_pipeline(
        TruncatedSVD(n_components, random_state=0),
        Normalizer(copy=False),
    )
    sweep_train = sweep_lsa.fit_transform(X_train_tfidf)
    sweep_test = sweep_lsa.transform(X_test_tfidf)

    rfc.fit(sweep_train, Y_train)
    sweep_pred = rfc.predict(sweep_test)
    # accuracy_score takes (y_true, y_pred); keep each score keyed by its
    # component count so it can be inspected or plotted afterwards.
    lsa_sweep_accuracy[n_components] = metrics.accuracy_score(Y_test, sweep_pred)
    print(lsa_sweep_accuracy[n_components])

0.5075987841945289
0.5319148936170213
0.48024316109422494
0.49240121580547114
0.49544072948328266
0.44984802431610943
0.5288753799392097
0.46808510638297873
0.46808510638297873
0.5531914893617021
0.49544072948328266
0.48024316109422494
0.49848024316109424
0.5167173252279635
0.5197568389057751
0.46808510638297873
0.5501519756838906
0.5349544072948328
0.5106382978723404
0.49848024316109424
0.5288753799392097
0.5258358662613982
0.513677811550152
0.5197568389057751
0.547112462006079
0.5501519756838906
0.5562310030395137
0.49848024316109424
0.5288753799392097
0.5319148936170213
0.5379939209726444
0.5349544072948328
0.5379939209726444
0.5349544072948328
0.5227963525835866
0.513677811550152
0.5531914893617021
0.5167173252279635
0.5288753799392097
0.513677811550152
0.5379939209726444
0.5227963525835866
0.547112462006079
0.547112462006079
0.5349544072948328
0.5258358662613982
0.5562310030395137
0.5379939209726444
0.5562310030395137
0.513677811550152
0.5075987841945289
0.5440729483282675
0.53191

As you can see, we're getting very poor results by trying to train word vectors on our fairly small dataset. No SVD reduction produces results meaningfully better than baseline. Next, we'll try word2vec.