In [124]:
import numpy as np
import pandas as pd

In [125]:
# Column names are supplied explicitly through `names=`.
movie_lines_features = ["LineID", "Character", "Movie", "Name", "Line"]

# The field delimiter is the literal text "+++$+++". "+" and "$" are regex
# metacharacters, so they are escaped; a raw string keeps the backslashes
# intact and avoids Python's invalid-escape-sequence warning.
# A multi-character separator is interpreted as a regex, which only the
# python parsing engine supports.
movie_lines = pd.read_csv(
    "movie_lines.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    index_col=False,
    names=movie_lines_features,
)
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [126]:
# Column names are supplied explicitly through `names=`.
movie_characters_features = ["Character", "Name", "Movie", "Title", "Gender", "Position"]

# Same "+++$+++" delimiter as movie_lines.txt; the raw string keeps the regex
# escapes intact and avoids Python's invalid-escape-sequence warning.
movie_characters = pd.read_csv(
    "movie_characters_metadata.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    index_col=False,
    names=movie_characters_features,
)

# Keep only the columns used further down. `.copy()` makes this an independent
# frame, so later column assignments on `movie_characters` do not write into a
# slice of the full table (which triggers SettingWithCopyWarning).
movie_characters = movie_characters[["Character", "Title", "Gender"]].copy()
movie_characters.head()

Unnamed: 0,Character,Title,Gender
0,u0,10 things i hate about you,f
1,u1,10 things i hate about you,?
2,u2,10 things i hate about you,m
3,u3,10 things i hate about you,?
4,u4,10 things i hate about you,m


In [127]:
# Trim whitespace from the join key in both tables (presumably padding left
# around the "+++$+++" delimiter) so that the keys compare equal.
# `.assign` builds new frames instead of assigning through attribute access,
# which avoids SettingWithCopyWarning on the column-subset `movie_characters`.
movie_lines = movie_lines.assign(Character=movie_lines["Character"].str.strip())
movie_characters = movie_characters.assign(Character=movie_characters["Character"].str.strip())

# Attach title and gender of the speaking character to every line.
# The key has the same name on both sides, so `on=` is sufficient.
character_lines = movie_characters.merge(movie_lines, on="Character")
character_lines.head()

Unnamed: 0,Character,Title,Gender,LineID,Movie,Name,Line
0,u0,10 things i hate about you,f,L1045,m0,BIANCA,They do not!
1,u0,10 things i hate about you,f,L985,m0,BIANCA,I hope so.
2,u0,10 things i hate about you,f,L925,m0,BIANCA,Let's go.
3,u0,10 things i hate about you,f,L872,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
4,u0,10 things i hate about you,f,L870,m0,BIANCA,I'm kidding. You know how sometimes you just...


In [128]:
# Number of rows in the merged character/line table.
character_lines.shape[0]

304713

Filtern: Nur Lines mit einer bestimmten Mindestlänge

In [129]:
# Boolean mask: rows whose whitespace-trimmed gender label is exactly 'f'.
# NOTE(review): the comparison is case-sensitive; verify whether the data
# also contains upper-case labels.
gender_filter_f = character_lines['Gender'].str.strip().eq('f')

# Keep only the label and the spoken line for the selected rows.
character_lines_f = character_lines.loc[gender_filter_f, ['Gender', 'Line']]
character_lines_f.head()

Unnamed: 0,Gender,Line
0,f,They do not!
1,f,I hope so.
2,f,Let's go.
3,f,Okay -- you're gonna need to learn how to lie.
4,f,I'm kidding. You know how sometimes you just...


In [130]:
# Boolean mask: rows whose whitespace-trimmed gender label is exactly 'm'.
# NOTE(review): the comparison is case-sensitive; verify whether the data
# also contains upper-case labels.
gender_filter_m = character_lines['Gender'].str.strip().eq('m')

# Keep only the label and the spoken line for the selected rows.
character_lines_m = character_lines.loc[gender_filter_m, ['Gender', 'Line']]
character_lines_m.head()

Unnamed: 0,Gender,Line
101,m,They do to!
102,m,She okay?
103,m,Wow
104,m,No
105,m,"The ""real you""."


In [131]:
# Number of lines in the 'f' subset.
character_lines_f.shape[0]

66228

In [132]:
# Down-sample the 'm' lines to the size of the 'f' subset so both classes
# contribute the same number of rows. The size is derived from the data
# instead of being hard-coded, so it stays correct if the input changes.
# NOTE(review): `head` takes the first rows rather than a random sample.
character_lines_m_s = character_lines_m.head(len(character_lines_f))
len(character_lines_m_s)

66228

In [133]:
# Stack the 'f' rows and the down-sampled 'm' rows into one frame.
# `pd.concat` states the intent directly; `verify_integrity=True` raises if
# the two subsets ever share an index label instead of silently combining
# rows. `sort_index()` restores the original row order of `character_lines`.
character_lines_fm = pd.concat(
    [character_lines_f, character_lines_m_s],
    verify_integrity=True,
).sort_index()
len(character_lines_fm)

132456

In [134]:
# Preview the first five rows of the combined frame.
character_lines_fm.head(n=5)

Unnamed: 0,Gender,Line
0,f,They do not!
1,f,I hope so.
2,f,Let's go.
3,f,Okay -- you're gonna need to learn how to lie.
4,f,I'm kidding. You know how sometimes you just...


In [135]:
# Drop rows without a line before casting: `astype(str)` would otherwise turn
# a missing value into the literal string 'nan', which the null check further
# down cannot detect.
character_lines_fm = character_lines_fm.dropna(subset=['Line']).astype(str)

In [136]:
def puncAndLower(text):
    """Remove punctuation from a Series of strings and lower-case the result.

    Every character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is deleted; whitespace itself is left untouched, so removing a
    token such as ``--`` can leave two consecutive spaces.

    Parameters
    ----------
    text : pandas.Series
        Series of strings to clean.

    Returns
    -------
    pandas.Series
        Cleaned, lower-cased strings with the same index as ``text``.
    """
    # regex=True must be explicit: from pandas 2.0 on, str.replace treats the
    # pattern as a literal string by default and would remove nothing.
    # The raw string keeps \w and \s from being read as string escapes.
    newLine = text.str.replace(r'[^\w\s]', '', regex=True)
    return newLine.str.lower()

In [137]:
import re

In [138]:
# Add the cleaned (punctuation-free, lower-cased) text as a new column.
character_lines_fm = character_lines_fm.assign(
    LineWithoutPunctuation=puncAndLower(character_lines_fm['Line'])
)

In [139]:
# Preview the first five rows, now including the cleaned text column.
character_lines_fm.head(n=5)

Unnamed: 0,Gender,Line,LineWithoutPunctuation
0,f,They do not!,they do not
1,f,I hope so.,i hope so
2,f,Let's go.,lets go
3,f,Okay -- you're gonna need to learn how to lie.,okay youre gonna need to learn how to lie
4,f,I'm kidding. You know how sometimes you just...,im kidding you know how sometimes you just b...


In [140]:
# A line is kept only if its raw text is longer than this many characters.
MIN_LINE_LENGTH = 110

# Build the mask from the frame that is being filtered, so mask and frame
# share the same index. A mask taken from the larger `character_lines` frame
# only works through implicit reindexing, which pandas warns about.
gender_filter_minlength = character_lines_fm['Line'].str.len() > MIN_LINE_LENGTH
character_lines_fm = character_lines_fm.loc[gender_filter_minlength]
len(character_lines_fm)

  


15397

In [141]:
# Missing values per column.
character_lines_fm.isna().sum()

Gender                    0
Line                      0
LineWithoutPunctuation    0
dtype: int64

In [142]:
from sklearn.model_selection import train_test_split

In [143]:
# Features: the cleaned line text. Target: the gender label.
X, y = (
    character_lines_fm['LineWithoutPunctuation'],
    character_lines_fm['Gender'],
)

In [144]:
# Hold out 30 % of the rows for evaluation; the fixed random_state makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

In [145]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [146]:
# Three text classifiers, each a TF-IDF vectoriser followed by an estimator.
# NOTE(review): only the Naive Bayes pipeline removes English stop words, so
# the three models do not see identical features.

# Linear support vector classifier.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Multinomial Naive Bayes on stop-word-filtered TF-IDF features.
text_clf_naive = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])

# Logistic regression.
text_clf_log = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

In [147]:
# Fit all three pipelines on the training split.
for _pipeline in (text_clf, text_clf_naive):
    _pipeline.fit(X_train, y_train)

# Fitted last and left as the final expression so its repr is the cell output.
text_clf_log.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [148]:
# Predict the held-out labels with each fitted pipeline, in the order
# LinearSVC, Naive Bayes, logistic regression.
predictions, predictions_naive, predictions_log = (
    fitted.predict(X_test)
    for fitted in (text_clf, text_clf_naive, text_clf_log)
)

In [149]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [150]:
# Print one confusion matrix per model. The third tuple element holds the
# extra arguments passed to print after the matrix: a trailing newline for the
# first two models, nothing for the last one.
for _title, _preds, _tail in (
    ('Linear Support Vector Classification:', predictions, ('\n',)),
    ('Naive Bayes:', predictions_naive, ('\n',)),
    ('Logistic regression:', predictions_log, ()),
):
    print(_title)
    print(confusion_matrix(y_test, _preds), *_tail)

Linear Support Vector Classification:
[[1247  891]
 [ 805 1677]] 

Naive Bayes:
[[1045 1093]
 [ 522 1960]] 

Logistic regression:
[[1154  984]
 [ 665 1817]]


In [151]:
# Print precision / recall / F1 per class for each model.
for _title, _preds in (
    ('Linear Support Vector Classification:', predictions),
    ('Naive Bayes:', predictions_naive),
    ('Logistic regression:', predictions_log),
):
    print(_title)
    print(classification_report(y_test, _preds), '\n')

Linear Support Vector Classification:
              precision    recall  f1-score   support

          f        0.61      0.58      0.60      2138
          m        0.65      0.68      0.66      2482

    accuracy                           0.63      4620
   macro avg       0.63      0.63      0.63      4620
weighted avg       0.63      0.63      0.63      4620
 

Naive Bayes:
              precision    recall  f1-score   support

          f        0.67      0.49      0.56      2138
          m        0.64      0.79      0.71      2482

    accuracy                           0.65      4620
   macro avg       0.65      0.64      0.64      4620
weighted avg       0.65      0.65      0.64      4620
 

Logistic regression:
              precision    recall  f1-score   support

          f        0.63      0.54      0.58      2138
          m        0.65      0.73      0.69      2482

    accuracy                           0.64      4620
   macro avg       0.64      0.64      0.64      462

In [152]:
# Print the overall accuracy on the test split for each model.
for _title, _preds in (
    ('Linear Support Vector Classification:', predictions),
    ('Naive Bayes:', predictions_naive),
    ('Logistic regression:', predictions_log),
):
    print(_title)
    print(accuracy_score(y_test, _preds), '\n')

Linear Support Vector Classification:
0.6329004329004329 

Naive Bayes:
0.6504329004329005 

Logistic regression:
0.643073593073593 



## Durch die Entfernung von Satzzeichen konnte keine Verbesserung erzielt werden