In [668]:
import numpy as np
import pandas as pd

In [669]:
movie_lines_features = ["LineID", "Character", "Movie", "Name", "Line"]
movie_lines = pd.read_csv("movie_lines.txt", sep = "\+\+\+\$\+\+\+", engine = "python", index_col = False, names = movie_lines_features)
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [670]:
movie_characters_features = ["Character", "Name", "Movie", "Title", "Gender",'Position']
movie_characters = pd.read_csv("movie_characters_metadata.txt", sep = "\+\+\+\$\+\+\+", engine = "python", index_col = False, names = movie_characters_features)
movie_characters = movie_characters[['Character', 'Title', 'Gender']]
movie_characters.head()

Unnamed: 0,Character,Title,Gender
0,u0,10 things i hate about you,f
1,u1,10 things i hate about you,?
2,u2,10 things i hate about you,m
3,u3,10 things i hate about you,?
4,u4,10 things i hate about you,m


In [671]:
movie_lines.Character = movie_lines.Character.str.strip()
movie_characters.Character = movie_characters.Character.str.strip()
character_lines = movie_characters.merge(movie_lines, left_on='Character', right_on='Character')
character_lines.head()

Unnamed: 0,Character,Title,Gender,LineID,Movie,Name,Line
0,u0,10 things i hate about you,f,L1045,m0,BIANCA,They do not!
1,u0,10 things i hate about you,f,L985,m0,BIANCA,I hope so.
2,u0,10 things i hate about you,f,L925,m0,BIANCA,Let's go.
3,u0,10 things i hate about you,f,L872,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
4,u0,10 things i hate about you,f,L870,m0,BIANCA,I'm kidding. You know how sometimes you just...


In [672]:
gender_filter_f = (character_lines['Gender'].str.strip() == 'f')
character_lines_f = character_lines.loc[gender_filter_f, ['Gender','Line']]
character_lines_f.head()

Unnamed: 0,Gender,Line
0,f,They do not!
1,f,I hope so.
2,f,Let's go.
3,f,Okay -- you're gonna need to learn how to lie.
4,f,I'm kidding. You know how sometimes you just...


In [673]:
len(character_lines_f)

66228

In [674]:
gender_filter_m = (character_lines['Gender'].str.strip() == 'm')
character_lines_m = character_lines.loc[gender_filter_m, ['Gender','Line']]
character_lines_m.head()

Unnamed: 0,Gender,Line
101,m,They do to!
102,m,She okay?
103,m,Wow
104,m,No
105,m,"The ""real you""."


In [675]:
len(character_lines_m)

154133

# Mehr Lines von Männer als von Frauen

# Untersuchung ob die Werte der Predition genauso gut sind wenn mit einem Verhältnis von 50/50 gearbeitet wird


In [676]:
character_lines_m_s = character_lines_m.head(66228)
len(character_lines_m_s)

66228

In [677]:
character_lines_m_s.head()

Unnamed: 0,Gender,Line
101,m,They do to!
102,m,She okay?
103,m,Wow
104,m,No
105,m,"The ""real you""."


In [678]:
character_lines_f.head()

Unnamed: 0,Gender,Line
0,f,They do not!
1,f,I hope so.
2,f,Let's go.
3,f,Okay -- you're gonna need to learn how to lie.
4,f,I'm kidding. You know how sometimes you just...


In [679]:
character_lines_fm = character_lines_f.add(character_lines_m_s, fill_value='')
len(character_lines_fm)

132456

In [680]:
character_lines_fm.head()
len(character_lines_fm)

132456

In [681]:
gender_filter_minlength = (character_lines['Line'].str.len() > 110)
character_lines_fm = character_lines_fm[gender_filter_minlength]
len(character_lines_fm)

  


15397

In [682]:
character_lines_fm.isnull().sum()

Gender    0
Line      0
dtype: int64

In [683]:
character_lines_fm.dropna(inplace=True)

In [684]:
from sklearn.model_selection import train_test_split

In [685]:
X = character_lines_fm['Line']
y = character_lines_fm['Gender']

In [686]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=30)

In [687]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [688]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf',LinearSVC())])

In [689]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [690]:
predictions = text_clf.predict(X_test)

In [691]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [692]:
print(confusion_matrix(y_test,predictions))

[[1282  856]
 [ 811 1671]]


In [693]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

          f        0.61      0.60      0.61      2138
          m        0.66      0.67      0.67      2482

    accuracy                           0.64      4620
   macro avg       0.64      0.64      0.64      4620
weighted avg       0.64      0.64      0.64      4620



In [694]:
print(accuracy_score(y_test,predictions))

0.6391774891774892


# Modell schlechter jedoch bessere Werte bei der Erkennung von Frauen