# Objective:
1. Find Accuracy for Pipeline based on CountVectorier and TfIDF


# Accuracy: 0.5900242028833

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dense, Activation, Dropout, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split

In [2]:
print('Reading DataFrame...')

fpath = '/content/mydrive/MyDrive/personal_data/Random_Data/REA/name_gender.csv'
fpath = 'name_gender_clean.csv'
import pandas as pd
df = pd.read_csv(fpath)
print(df)

Reading DataFrame...
       Unnamed: 0       name gender
0               0     casina      F
1               1    crettie      F
2               2      lenia      F
3               3      hasya      F
4               4    euguene      M
...           ...        ...    ...
69437       69437      janea      F
69438       69438      lalit      M
69439       69439   wilberth      M
69440       69440    kevonte      M
69441       69441  amaryllis      F

[69442 rows x 3 columns]


In [3]:
names = df['name']
gender = df['gender']
maxlen = 20
labels = 2

In [4]:
# plt.figure(figsize=(12,8))
# plt.hist([len(a) for a in names], bins=36)
# plt.title("Length of the names")
# # plt.show()

In [5]:
print("Male : " + str(sum(gender=='M')))
print("Female : " + str(sum(gender=='F')))

Male : 34721
Female : 34721


In [6]:
vocab = set(' '.join([str(i) for i in names]))
vocab.add('END')
len_vocab = len(vocab)

In [7]:
print('Indexing...')
char_index = dict((c, i) for i, c in enumerate(vocab))
print(char_index)

Indexing...
{' ': 0, 'p': 1, 'c': 2, 'i': 3, 'j': 4, 'x': 5, 'w': 6, 'r': 7, 'h': 8, 'v': 9, 'END': 10, 'y': 11, 'g': 12, 't': 13, 'm': 14, 'o': 15, 'u': 16, 'q': 17, 'e': 18, 'b': 19, 'l': 20, 'a': 21, 'f': 22, 'n': 23, 'z': 24, 's': 25, 'd': 26, 'k': 27}


In [8]:
X = []
y = []

# Builds an empty line with a 1 at the index of character
def set_flag(i):
    tmp = np.zeros(len_vocab);
    tmp[i] = 1
    return list(tmp)

# Truncate names and create the matrix
def prepare_X(X):
    new_list = []
    trunc_train_name = [str(i)[0:maxlen] for i in X]

    for i in trunc_train_name:
        tmp = [set_flag(char_index[j]) for j in str(i)]
        for k in range(0,maxlen - len(str(i))):
            tmp.append(set_flag(char_index["END"]))
        new_list.append(tmp)

    return new_list


X = prepare_X(names.values)

# Label Encoding of y
def prepare_y(y):
    new_list = []
    for i in y:
        if i == 'M':
            new_list.append([1,0])
        else:
            new_list.append([0,1])

    return new_list

y = prepare_y(gender)

In [9]:
print('Innit sklearn.preprocessing train_test_split...')
X_train, X_test, y_train, y_test = train_test_split(X, y)

Innit sklearn.preprocessing train_test_split...


# Manipulated df (without shuffling)

In [10]:
len(df)

69442

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,gender
0,0,casina,F
1,1,crettie,F
2,2,lenia,F
3,3,hasya,F
4,4,euguene,M


# custom Train Test Split

In [12]:
print('Innit Custom Train Test Split..')
df_train_ = df[:int(len(df)*0.9)]
df_test_ = df[int(len(df)*0.9):]
df_train_.reset_index(drop=True,inplace=True)
df_test_.reset_index(drop=True,inplace=True)

Innit Custom Train Test Split..


In [13]:
name_list = df_train_['name'].to_list()
name_list[:3]

['casina', 'crettie', 'lenia']

# Count Vectorized and TfIDF

In [14]:
print('Innit Vectorizing CountVectorizer..')

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(name_list)
X_train_counts
X_train_counts.shape

Innit Vectorizing CountVectorizer..


(62497, 62497)

In [15]:
print('Innit TfidfTransformer..')

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

Innit TfidfTransformer..


(62497, 62497)

In [16]:
X_train_tf

<62497x62497 sparse matrix of type '<class 'numpy.float64'>'
	with 62497 stored elements in Compressed Sparse Row format>

In [17]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(62497, 62497)

In [18]:
from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)#orig
mlbclf = MultinomialNB().fit(X_train_tfidf, df_train_.gender)#change

In [19]:
print('Innit predicting MultinomialNB..')

docs_new = df_test_['gender']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

mlbpredicted = mlbclf.predict(X_new_tfidf)
mlbpredicted

Innit predicting MultinomialNB..


array(['M', 'M', 'M', ..., 'M', 'M', 'M'], dtype='<U1')

# Pipeline

In [20]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
        ])

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,gender
0,0,casina,F
1,1,crettie,F
2,2,lenia,F
3,3,hasya,F
4,4,euguene,M


In [22]:
text_clf.fit(df_train_.name, df_train_.gender)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [23]:
print('Predicting')
import numpy as np
# twenty_test = fetch_20newsgroups(subset='test',
# categories=categories, shuffle=True, random_state=42)
docs_test = df_test_.name
predicted = text_clf.predict(docs_test)
print("predicted:",predicted)

Predicting
predicted: ['M' 'M' 'M' ... 'M' 'M' 'M']


In [25]:
from sklearn.metrics import accuracy_score
print('Final Accuracy Score:', accuracy_score(predicted,df_test_.gender))

Final Accuracy Score: 0.4958963282937365
