In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from pathlib import Path
pd.set_option('display.max_rows', 100)


In [2]:
dialogpath = Path('/Users/michelleschlacks/Desktop/movie_dialogue.tsv')
chars = pd.read_csv(dialogpath, sep = '\t')
chars = chars.sample(frac = 1.0)
                  
chars.head()


Unnamed: 0,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines
2342,m533,u7883,JACKIE,stepmom,f,3037,1998,"['comedy', 'drama']",True,False,True,False,Good. That's very good. / Don't be. I'm goin...
2814,m70,u1070,BONES,get shorty,m,242,1995,"['comedy', 'crime', 'thriller']",True,True,False,False,You say look at you? / What? / Where's Leo Dev...
2102,m489,u7245,HAN,star wars: episode vi - return of the jedi,m,403,1983,"['action', 'adventure', 'fantasy', 'sci-fi']",False,False,False,False,Who are you? / Where am I? / I can't see. / So...
591,m203,u3104,KAY,the godfather,f,377,1972,"['crime', 'drama', 'thriller']",False,True,True,False,Will you give this letter to Michael. / Will y...
837,m247,u3742,AGENT,apocalypse now,m,282,1979,"['drama', 'war']",False,False,True,False,I don't know what you're talking about -- Get ...


In [4]:
chars = chars.loc[~pd.isnull(chars['lines']), : ]
chars = chars.reset_index(drop = True)  # try commenting this line out, and see how it
                                          # makes your task more difficult
chars.shape

(2969, 13)

In [5]:
chars['lines'].value_counts()

Afraid of lions. / Underneath.  Somewhere. / Once... / Three years I've worked for the railroad. Now I don't know why. It seemed a good idea once. / He doesn't. He needs nobody. But we have hunted many times...  ...he knows I am afraid of lions... / Since his beard was red. / Oh yes. But it takes time. / Soon. / You give me hope, John. / Do you love her? / Good news? / For you. / Oh yes, I think so. / We should construct thorn fences around every tent area. Fires burning at night. / You don't know what Tsavo means, do you?  It means "slaughter"... / You- they cannot believe you're still here. / I am also liaison between these two. / Did it look like this in your mind? / Get on? They detest each other. Obviously the Africans hate the Indians. But the Indians also hate the other Indians. Some of them worship cows, while others eat them. / Thank you. The truth is this: you have to work at it constantly. / Because they are here.  Because Tsavo is the worst place in the world.  Come, John- 

In [7]:
vectorizer = CountVectorizer(max_features = 4000)
sparse_wordcounts = vectorizer.fit_transform(chars.lines)
wordcounts = sparse_wordcounts.toarray()
charwords = pd.DataFrame(wordcounts, columns = vectorizer.get_feature_names())
charwords.head()

Unnamed: 0,000,10,100,11,12,14,15,18,20,24,...,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo
0,0,2,0,0,0,0,0,0,1,0,...,25,0,2,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0


In [26]:
keepers = []
indeces = [] #zero = comedy, one = action movie
for idx, row in chars.iterrows():
    if 'action' in row.genres and 'comedy' in row.genres:
        continue
    if 'action' in row.genres or 'comedy' in row.genres:
        keepers.append(idx)
        if 'action' in row.genres:
            indeces.append(1)
        else:
            indeces.append(0)
            
#print(indeces)
sum_of_action = 0
sum_of_comedy = 0
for i in indeces:
    if i == 0:
        sum_of_comedy += 1
    else:
        sum_of_action += 1
        
print(sum_of_action)
print(sum_of_comedy)
for i in keepers:
    print(chars.loc[[i]])

542
756
    mid    cid   cname    mname gender  wordcount  year               genres  \
0  m533  u7883  JACKIE  stepmom      f       3037  1998  ['comedy', 'drama']   

   comedy  thriller  drama  romance  \
0    True     False   True    False   

                                               lines  
0  Good.  That's very good. / Don't be.  I'm goin...  
   mid    cid  cname       mname gender  wordcount  year  \
1  m70  u1070  BONES  get shorty      m        242  1995   

                            genres  comedy  thriller  drama  romance  \
1  ['comedy', 'crime', 'thriller']    True      True  False    False   

                                               lines  
1  You say look at you? / What? / Where's Leo Dev...  
    mid    cid cname                                       mname gender  \
2  m489  u7245   HAN  star wars: episode vi - return of the jedi      m   

   wordcount  year                                        genres  comedy  \
2        403  1983  ['action', 'adventu

166  When did you two talk this over? / Well -- whe...  
      mid    cid       cname         mname gender  wordcount  year  \
167  m186  u2828  CARRUTHERS  smokin' aces      m        281  2006   

                                       genres  comedy  thriller  drama  \
167  ['action', 'crime', 'drama', 'thriller']   False      True   True   

     romance                                              lines  
167    False  Yeah. / Mortal. / No.  I'm going over there. Y...  
     mid   cid   cname             mname gender  wordcount  year  \
170  m19  u320  PARKER  american outlaws      m        265  2001   

                    genres  comedy  thriller  drama  romance  \
170  ['action', 'western']   False     False  False    False   

                                                 lines  
170  There's only four of them... / How can that be...  
      mid    cid cname                         mname gender  wordcount  year  \
171  m217  u3283   TED  there's something about mary      m  

337  Aloha, Spicoli. / Don't worry, Spicoli. You'll...  
      mid    cid cname              mname gender  wordcount  year      genres  \
338  m605  u8893   DAD  who's your daddy?      m        317  2003  ['comedy']   

     comedy  thriller  drama  romance  \
338    True     False  False    False   

                                                 lines  
338  We think you should give up the magazine. Sell...  
      mid    cid cname     mname gender  wordcount  year               genres  \
339  m537  u7936  JEFF  suburbia      m       2571  1996  ['comedy', 'drama']   

     comedy  thriller  drama  romance  \
339    True     False   True    False   

                                                 lines  
339  You know, like, like, I would always think, uh...  
      mid    cid   cname                     mname gender  wordcount  year  \
340  m196  u2997  PICARD  star trek: first contact      m       1166  1996   

                                            genres  comedy  thrill

453  Yes, sir. / Uh, oh. / What are we going to do?...  
      mid    cid cname          mname gender  wordcount  year  \
455  m411  u6168   LEX  jurassic park      f        246  1993   

                                                genres  comedy  thriller  \
455  ['action', 'adventure', 'family', 'sci-fi', 'a...   False     False   

     drama  romance                                              lines  
455  False    False  Phone security systems, everything works.  You...  
      mid    cid     cname                     mname gender  wordcount  year  \
456  m452  u6774  JULIANNE  my best friend's wedding      f       2014  1997   

                    genres  comedy  thriller  drama  romance  \
456  ['comedy', 'romance']    True     False  False     True   

                                                 lines  
456  I'll think about it.  I'm okay. / I did what I...  
      mid    cid   cname                  mname gender  wordcount  year  \
457  m244  u3710  MONICA  the anni

624   True     True  I'm all right. / Hey, can we go to McDonald's?...  
    mid  cid   cname              mname gender  wordcount  year  \
628  m5  u85  LEELOO  the fifth element      f        233  1997   

                                                genres  comedy  thriller  \
628  ['action', 'adventure', 'romance', 'sci-fi', '...   False      True   

     drama  romance                                              lines  
628  False     True  Akta dedero ansila do mektet. / Vano da, mecht...  
      mid    cid cname        mname gender  wordcount  year  \
632  m266  u3998   EVE  being there      f       1371  1979   

                  genres  comedy  thriller  drama  romance  \
632  ['drama', 'comedy']    True     False   True    False   

                                                 lines  
632  I do - we do - both of us, Ben and I feel so m...  
      mid    cid  cname                  mname gender  wordcount  year  \
641  m244  u3703  CLAIR  the anniversary party      f

825  Yes!  But don't hold that against me. I'm a li...  
      mid    cid   cname           mname gender  wordcount  year      genres  \
829  m117  u1772  MARGOT  legally blonde      f        252  2001  ['comedy']   

     comedy  thriller  drama  romance  \
829    True     False  False    False   

                                                 lines  
829  Hello! You're like, a lawyer. / Speaking of wh...  
      mid    cid      cname mname gender  wordcount  year  \
833  m584  u8609  DILLINGER  tron      m        260  1982   

                                                genres  comedy  thriller  \
833  ['action', 'adventure', 'sci-fi', 'thriller', ...   False      True   

     drama  romance                                              lines  
833  False    False  Ah. Sounds good. Well, we should have you runn...  
      mid    cid cname          mname gender  wordcount  year  \
834  m246  u3727   BUD  the apartment      M       4642  1960   

                             gen

985     True  Well, then, I know it takes three -- four week...  
    mid  cid   cname       mname gender  wordcount  year  \
987  m2  u26  CUTLER  15 minutes      m        219  2001   

                                       genres  comedy  thriller  drama  \
987  ['action', 'crime', 'drama', 'thriller']   False      True   True   

     romance                                              lines  
987    False  Officers, there's your killer, do your duty, a...  
     mid   cid   cname                    mname gender  wordcount  year  \
988  m12  u180  ELAINE  airplane ii: the sequel      f        589  1982   

                              genres  comedy  thriller  drama  romance  \
988  ['comedy', 'romance', 'sci-fi']    True     False  False     True   

                                                 lines  
988  Ted, the lever! / Compute! / Set! / Not when I...  
      mid    cid     cname    mname gender  wordcount  year  \
989  m329  u4952  REYNOLDS  ed wood      m        182  

1147  That was my first instinct too. Or D.W.I. Anyt...  
       mid    cid    cname       mname gender  wordcount  year  \
1151  m444  u6657  RAYMOND  moonstruck      m        348  1987   

                              genres  comedy  thriller  drama  romance  \
1151  ['comedy', 'romance', 'drama']    True     False   True     True   

                                                  lines  
1151  No. You were there. / I never seen anybody so ...  
      mid   cid  cname                         mname gender  wordcount  year  \
1155  m59  u915  STACY  fast times at ridgemont high      f       1408  1982   

                              genres  comedy  thriller  drama  romance  \
1155  ['comedy', 'drama', 'romance']    True     False   True     True   

                                                  lines  
1155  No. / Brad. Please don't tell Mom and Dad... /...  
       mid    cid  cname                  mname gender  wordcount  year  \
1156  m244  u3713  SALLY  the anniversary p

1350  And this is mine. / You took Bill. / Okay. / A...  
      mid   cid   cname           mname gender  wordcount  year  \
1357  m24  u393  DEBBIE  bachelor party      f        428  1984   

                     genres  comedy  thriller  drama  romance  \
1357  ['comedy', 'romance']    True     False  False     True   

                                                  lines  
1357  Right in here. The big show starts in one minu...  
       mid    cid     cname             mname gender  wordcount  year  \
1358  m214  u3238  OVERLORD  the time machine      M        421  2002   

                                 genres  comedy  thriller  drama  romance  \
1358  ['sci-fi', 'adventure', 'action']   False     False  False    False   

                                                  lines  
1358  And why not? / But you've earned a reward for ...  
       mid    cid cname     mname gender  wordcount  year  \
1359  m323  u4843  HANS  die hard      m        589  1988   

                   

1522    False  <i>Promise me... you won't tell her... that I ...  
       mid    cid              cname  \
1523  m536  u7928  PRESIDENT MUFFLEY   

                                                  mname gender  wordcount  \
1523  dr. strangelove or: how i learned to stop worr...      M        757   

      year               genres  comedy  thriller  drama  romance  \
1523  1964  ['comedy', 'drama']    True     False   True    False   

                                                  lines  
1523  Oh, yes, that's right. / Goodbye, Mister Ambas...  
       mid    cid  cname                   mname gender  wordcount  year  \
1526  m380  u5737  DAVID  hannah and her sisters      m        243  1986   

                              genres  comedy  thriller  drama  romance  \
1526  ['comedy', 'drama', 'romance']    True     False   True     True   

                                                  lines  
1526  And then, uh, April...huh? / Uh, who gets drop...  
       mid    cid  cname

1679    False  Gillespie, put in call for retrieval. We're of...  
       mid    cid     cname mname gender  wordcount  year  \
1680  m263  u3965  GRIERSON  bean      m        558  1997   

                    genres  comedy  thriller  drama  romance  \
1680  ['comedy', 'family']    True     False  False    False   

                                                  lines  
1680  Good point Bernie - precisely the kind of perc...  
       mid    cid  cname           mname gender  wordcount  year  \
1684  m261  u3935  CHASE  batman forever      f        624  1995   

                                          genres  comedy  thriller  drama  \
1684  ['action', 'crime', 'fantasy', 'thriller']   False      True  False   

      romance                                              lines  
1684    False  Did Two-Face call him Bruce? / Maybe you just ...  
       mid    cid  cname           mname gender  wordcount  year  \
1685  m285  u4274  ERNIE  broadcast news      m        327  1987   

  

1831  False     True  Then why is there fear behind your eyes? / I d...  
       mid    cid   cname                       mname gender  wordcount  year  \
1832  m351  u5296  BREWER  rambo: first blood part ii      M        564  1985   

                                   genres  comedy  thriller  drama  romance  \
1832  ['action', 'adventure', 'thriller']   False      True  False    False   

                                                  lines  
1832  ... and there's this guy with a black helmet a...  
       mid    cid  cname        mname gender  wordcount  year  \
1838  m472  u7057  JOHNS  pitch black      m       1265  2000   

                                genres  comedy  thriller  drama  romance  \
1838  ['action', 'sci-fi', 'thriller']   False      True  False    False   

                                                  lines  
1838  How much you weigh right now, Fry? Huh? / -- t...  
       mid    cid cname      mname gender  wordcount  year  \
1841  m149  u2312  LEON  n

1978  Shit, that was no big deal. / Forty-two-hundre...  
       mid    cid cname             mname gender  wordcount  year  \
1981  m163  u2521  LENA  punch-drunk love      f        904  2002   

                              genres  comedy  thriller  drama  romance  \
1981  ['comedy', 'drama', 'romance']    True     False   True     True   

                                                  lines  
1981  So here we go. / You can't do that. / You left...  
       mid    cid    cname      mname gender  wordcount  year  \
1982  m450  u6749  JEFFREY  my girl 2      m        212  1994   

                                        genres  comedy  thriller  drama  \
1982  ['comedy', 'drama', 'family', 'romance']    True     False   True   

      romance                                              lines  
1982     True  She didn't wanna miss out on anything...especi...  
       mid    cid cname                    mname gender  wordcount  year  \
1984  m195  u2971  ANIJ  star trek: insurrecti

2128  Hank was always good with the ladies. Always g...  
      mid   cid cname            mname gender  wordcount  year  \
2133  m54  u839   AUD  erik the viking      f        385  1989   

                                  genres  comedy  thriller  drama  romance  \
2133  ['comedy', 'adventure', 'fantasy']    True     False  False    False   

                                                  lines  
2133  It's sinking! Hy-Brasil is sinking! / But, Fat...  
       mid    cid cname      mname gender  wordcount  year  \
2134  m142  u2202  BENI  the mummy      m        336  1999   

                                  genres  comedy  thriller  drama  romance  \
2134  ['action', 'adventure', 'fantasy']   False     False  False    False   

                                                  lines  
2134  Something about bringing his dead girly-friend...  
      mid   cid  cname            mname gender  wordcount  year  \
2137  m54  u846  HELGA  erik the viking      f        226  1989   

   

2287  WHEN SHE'S SAFE! / You do, and the spring thaw...  
       mid    cid   cname          mname gender  wordcount  year  \
2288  m246  u3732  MARGIE  the apartment      f        257  1960   

                              genres  comedy  thriller  drama  romance  \
2288  ['romance', 'comedy', 'drama']    True     False   True     True   

                                                  lines  
2288  You bet I will. And when I tell him how you tr...  
       mid    cid cname         mname gender  wordcount  year  \
2290  m462  u6906   MAX  notting hill      m        274  1999   

                     genres  comedy  thriller  drama  romance  \
2290  ['comedy', 'romance']    True     False  False     True   

                                                  lines  
2290  No! / Now you're lying. / I'm delighted. / I'm...  
       mid    cid  cname        mname gender  wordcount  year  \
2296  m598  u8810  BREAN  wag the dog      m       3555  1997   

                   genres  come

2436     True  Betty! Listen to me! Del is ... / Do what? / B...  
       mid    cid  cname      mname gender  wordcount  year  \
2441  m585  u8624  HELEN  true lies      f       1008  1994   

                      genres  comedy  thriller  drama  romance  \
2441  ['action', 'thriller']   False      True  False    False   

                                                  lines  
2441  Will you shutup.  I should never have told you...  
       mid    cid  cname             mname gender  wordcount  year  \
2444  m522  u7722  SUGAR  some like it hot      f       2567  1959   

          genres  comedy  thriller  drama  romance  \
2444  ['comedy']    True     False  False    False   

                                                  lines  
2444  He's going to South America to marry some othe...  
       mid    cid   cname  mname gender  wordcount  year               genres  \
2447  m519  u7686  VINNIE  smoke      m        215  1995  ['comedy', 'drama']   

      comedy  thriller  dram

2614   True    False  What are they doing? / It was in Latin, sir. /...  
      mid    cid  cname             mname gender  wordcount  year  \
2616  m97  u1440  MARTY  independence day      m        133  1996   

                                             genres  comedy  thriller  drama  \
2616  ['action', 'adventure', 'sci-fi', 'thriller']   False      True  False   

      romance                                              lines  
2616    False  Then what? / A countdown to what? / Not really...  
       mid    cid cname            mname gender  wordcount  year  \
2618  m126  u1928  LARA  minority report      f        325  2002   

                                                 genres  comedy  thriller  \
2618  ['action', 'crime', 'mystery', 'sci-fi', 'thri...   False      True   

      drama  romance                                              lines  
2618   True    False  John?  What is it? / I know... I do, too... / ...  
       mid    cid cname               mname gender  

2808  Now hold on, Baxter -- / What's gotten into yo...  
      mid    cid  cname       mname gender  wordcount  year  \
2809  m89  u1345  MORAN  highlander      m        528  1986   

                     genres  comedy  thriller  drama  romance  \
2809  ['action', 'fantasy']   False     False  False    False   

                                                  lines  
2809  Don't be stupid, lady.  Your neck can be slice...  
       mid    cid   cname   mname gender  wordcount  year  \
2810  m355  u5345  FLETCH  fletch      m       3180  1985   

                              genres  comedy  thriller  drama  romance  \
2810  ['comedy', 'crime', 'mystery']    True     False  False    False   

                                                  lines  
2810  Comanche Indian. / God I admire you. / I feel ...  
       mid    cid       cname      mname gender  wordcount  year  \
2811  m579  u8540  MRS. DAVIS  toy story      f        177  1995   

                                           

In [None]:
title_basics = pd.read_csv('/Users/michelleschlacks/Desktop/final project IS417/title.basics.tsv')
title_ratings = pd.read_csv('/Users/michelleschlacks/Desktop/final project IS417/title.ratings.tsv')

In [None]:
title_basics