In [88]:
import numpy as np
import pandas as pd

In [89]:
# Column names for movie_lines.txt, in file order; they are supplied here via
# `names=` because the first row of the file is parsed as data.
movie_lines_features = ["LineID", "Character", "Movie", "Name", "Line"]
# Fields are separated by the literal token "+++$+++". The pattern is written
# as a raw string so that "\+" and "\$" reach the regex engine unchanged
# instead of being treated as (invalid) Python string escapes. A regex
# separator needs the python parsing engine. Whitespace around the delimiter
# is kept in the parsed values.
movie_lines = pd.read_csv(
    "movie_lines.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    index_col=False,
    names=movie_lines_features,
)
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [90]:
# Column names for movie_characters_metadata.txt, in file order.
movie_characters_features = ["Character", "Name", "Movie", "Title", "Gender", 'Position']
# Same "+++$+++" field delimiter as the lines file; raw string so the regex
# escapes are not interpreted as (invalid) Python string escapes.
movie_characters = pd.read_csv(
    "movie_characters_metadata.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    index_col=False,
    names=movie_characters_features,
)
# Keep only the columns used later. The explicit copy makes the subset an
# independent frame, so assigning to its columns afterwards does not raise a
# SettingWithCopyWarning.
movie_characters = movie_characters[['Character', 'Title', 'Gender']].copy()
movie_characters.head()

Unnamed: 0,Character,Title,Gender
0,u0,10 things i hate about you,f
1,u1,10 things i hate about you,?
2,u2,10 things i hate about you,m
3,u3,10 things i hate about you,?
4,u4,10 things i hate about you,m


In [91]:
# Trim surrounding whitespace from the character ids on both sides so the
# join keys compare equal.
movie_lines['Character'] = movie_lines['Character'].str.strip()
movie_characters['Character'] = movie_characters['Character'].str.strip()
# Attach each character's title and gender to every line that character speaks.
character_lines = movie_characters.merge(movie_lines, on='Character')
character_lines.head()

Unnamed: 0,Character,Title,Gender,LineID,Movie,Name,Line
0,u0,10 things i hate about you,f,L1045,m0,BIANCA,They do not!
1,u0,10 things i hate about you,f,L985,m0,BIANCA,I hope so.
2,u0,10 things i hate about you,f,L925,m0,BIANCA,Let's go.
3,u0,10 things i hate about you,f,L872,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
4,u0,10 things i hate about you,f,L870,m0,BIANCA,I'm kidding. You know how sometimes you just...


In [92]:
# Boolean mask of rows whose (whitespace-trimmed) gender label is 'f'.
gender_filter_f = character_lines['Gender'].str.strip().eq('f')
# Only the label and the spoken text are needed for classification.
character_lines_f = character_lines.loc[gender_filter_f, ['Gender', 'Line']]
character_lines_f.head()

Unnamed: 0,Gender,Line
0,f,They do not!
1,f,I hope so.
2,f,Let's go.
3,f,Okay -- you're gonna need to learn how to lie.
4,f,I'm kidding. You know how sometimes you just...


In [93]:
# Boolean mask of rows whose (whitespace-trimmed) gender label is 'm'.
gender_filter_m = character_lines['Gender'].str.strip().eq('m')
# Only the label and the spoken text are needed for classification.
character_lines_m = character_lines.loc[gender_filter_m, ['Gender', 'Line']]
character_lines_m.head()

Unnamed: 0,Gender,Line
101,m,They do to!
102,m,She okay?
103,m,Wow
104,m,No
105,m,"The ""real you""."


In [94]:
# Cut the male lines down to the number of female lines instead of using a
# hard-coded row count, so the cell stays correct if the input data changes.
# NOTE: head() keeps the first rows in index order; it is not a random sample.
character_lines_m_s = character_lines_m.head(len(character_lines_f))
len(character_lines_m_s)

66228

In [95]:
# Stack the female and the down-sampled male rows into one frame. pd.concat is
# the row-wise combination intended here; the previous element-wise string
# addition with an empty fill value only produced the same rows because the
# two indexes do not overlap. sort_index() keeps the rows in index order.
character_lines_fm = pd.concat([character_lines_f, character_lines_m_s]).sort_index()
len(character_lines_fm)

132456

In [96]:
# Preview the first rows of the combined female/male frame.
character_lines_fm.head()

Unnamed: 0,Gender,Line
0,f,They do not!
1,f,I hope so.
2,f,Let's go.
3,f,Okay -- you're gonna need to learn how to lie.
4,f,I'm kidding. You know how sometimes you just...


In [97]:
import spacy
# Load the spaCy pipeline "en_core_web_sm"; `nlp` is applied to each dialogue
# line further down to obtain per-token lemmas.
nlp = spacy.load("en_core_web_sm")

In [98]:
# Show the raw text stored under index label 0 (note the leading space).
character_lines_fm.loc[0, 'Line']

' They do not!'

In [99]:
# Cast every column to str so each Line value can be handed to spaCy as text.
# NOTE(review): missing values, if any exist, presumably become the literal
# string 'nan' here rather than staying null — confirm.
character_lines_fm = character_lines_fm.astype(str)

In [100]:
# Preview the frame again after the cast to str.
character_lines_fm.head()

Unnamed: 0,Gender,Line
0,f,They do not!
1,f,I hope so.
2,f,Let's go.
3,f,Okay -- you're gonna need to learn how to lie.
4,f,I'm kidding. You know how sometimes you just...


In [101]:
# Keep only lines strictly longer than this many characters.
MIN_LINE_LENGTH = 110
# Despite its name this mask filters on line length, not gender. It is built
# from the frame it is applied to, so mask and frame share the same index and
# pandas does not have to reindex a mask taken from a different, larger frame.
gender_filter_minlength = character_lines_fm['Line'].str.len() > MIN_LINE_LENGTH
# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not raise a SettingWithCopyWarning.
character_lines_fm = character_lines_fm.loc[gender_filter_minlength].copy()
len(character_lines_fm)

  


15397

In [102]:
def checkLemma(text, lemma):
    """Pick the string that represents a token in the lemmatised line.

    Args:
        text: The token's original surface text.
        lemma: The lemma reported for the token.

    Returns:
        ``text`` when ``lemma`` is the placeholder ``'-PRON-'`` (so the actual
        word is kept instead of the placeholder), otherwise ``lemma``.
    """
    return text if lemma == '-PRON-' else lemma

In [103]:
# Lemmatise every line. nlp.pipe processes the texts in batches and yields the
# parsed documents in input order, which avoids one separate pipeline call per
# row. Tokens are re-joined with single spaces, and checkLemma keeps the
# original word wherever the lemma is the '-PRON-' placeholder.
lemmatised_lines = [
    " ".join(checkLemma(token.text, token.lemma_) for token in doc)
    for doc in nlp.pipe(character_lines_fm["Line"])
]
# assign() returns a new frame with the extra column, so nothing is written
# into a filtered slice of another frame.
character_lines_fm = character_lines_fm.assign(LemLine=lemmatised_lines)

In [104]:
# Preview the raw Line next to its lemmatised LemLine.
character_lines_fm.head()

Unnamed: 0,Gender,Line,LemLine
17,f,Lesbian? No. I found a picture of Jared Leto...,Lesbian ? no . I find a picture of Jared L...
25,f,I don't want to know how to say that though. ...,I do not want to know how to say that though...
28,f,Unsolved mystery. She used to be really popu...,unsolved mystery . She use to be really po...
29,f,"The thing is, Cameron -- I'm at the mercy of ...","the thing be , Cameron -- I be at the mercy ..."
33,f,Can we make this quick? Roxanne Korrine and ...,Can we make this quick ? Roxanne Korrine a...


In [105]:
# Count null entries per column.
# NOTE(review): the frame was cast with astype(str) earlier, which presumably
# turned any missing value into the string 'nan'; if so this check can only
# ever report zeros — confirm, and check for nulls before the cast instead.
character_lines_fm.isnull().sum()

Gender     0
Line       0
LemLine    0
dtype: int64

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
# Features are the lemmatised lines; targets are the gender labels.
X, y = character_lines_fm['LemLine'], character_lines_fm['Gender']

In [108]:
# Hold out 30% of the samples for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

In [109]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [110]:
# Three TF-IDF based text classifiers that differ in the final estimator.
# Only the Naive Bayes pipeline removes English stop words.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
text_clf_naive = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])
text_clf_log = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

In [111]:
# Train all three pipelines on the same training split.
for pipeline in (text_clf, text_clf_naive):
    pipeline.fit(X_train, y_train)
# Fitted last and left as the final expression so the notebook displays it.
text_clf_log.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [112]:
# Predict the held-out labels with each fitted pipeline (SVC, Naive Bayes,
# logistic regression, in that order).
predictions, predictions_naive, predictions_log = (
    model.predict(X_test)
    for model in (text_clf, text_clf_naive, text_clf_log)
)

In [113]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [114]:
# Print one confusion matrix per classifier. The first two are followed by an
# extra newline argument as a visual separator; the last one is not.
for title, predicted, trailer in (
    ('Linear Support Vector Classification:', predictions, ('\n',)),
    ('Naive Bayes:', predictions_naive, ('\n',)),
    ('Logistic regression:', predictions_log, ()),
):
    print(title)
    print(confusion_matrix(y_test, predicted), *trailer)

Linear Support Vector Classification:
[[1280  858]
 [ 795 1687]] 

Naive Bayes:
[[1092 1046]
 [ 541 1941]] 

Logistic regression:
[[1155  983]
 [ 683 1799]]


In [115]:
# Print precision / recall / f1 per class for each classifier, each report
# followed by an extra newline argument as a separator.
for title, predicted in (
    ('Linear Support Vector Classification:', predictions),
    ('Naive Bayes:', predictions_naive),
    ('Logistic regression:', predictions_log),
):
    print(title)
    print(classification_report(y_test, predicted), '\n')

Linear Support Vector Classification:
              precision    recall  f1-score   support

          f        0.62      0.60      0.61      2138
          m        0.66      0.68      0.67      2482

    accuracy                           0.64      4620
   macro avg       0.64      0.64      0.64      4620
weighted avg       0.64      0.64      0.64      4620
 

Naive Bayes:
              precision    recall  f1-score   support

          f        0.67      0.51      0.58      2138
          m        0.65      0.78      0.71      2482

    accuracy                           0.66      4620
   macro avg       0.66      0.65      0.64      4620
weighted avg       0.66      0.66      0.65      4620
 

Logistic regression:
              precision    recall  f1-score   support

          f        0.63      0.54      0.58      2138
          m        0.65      0.72      0.68      2482

    accuracy                           0.64      4620
   macro avg       0.64      0.63      0.63      462

In [116]:
# Print the overall accuracy of each classifier on the held-out split, each
# value followed by an extra newline argument as a separator.
for title, predicted in (
    ('Linear Support Vector Classification:', predictions),
    ('Naive Bayes:', predictions_naive),
    ('Logistic regression:', predictions_log),
):
    print(title)
    print(accuracy_score(y_test, predicted), '\n')

Linear Support Vector Classification:
0.6422077922077922 

Naive Bayes:
0.6564935064935065 

Logistic regression:
0.6393939393939394 



# Lemmatization yielded a minimal improvement