In [52]:
import pandas as pd
import numpy as np

# Prepare The Data

In [3]:
data = pd.read_csv("gender_classifier.csv", encoding="latin1")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  pr

In [4]:
dt = pd.concat([data.gender, data.description], axis=1)
dt.head()

Unnamed: 0,gender,description
0,male,i sing my own rhythm.
1,male,I'm the author of novels filled with family dr...
2,male,louis whining and squealing and all
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


In [5]:
dt.dropna(axis = 0, inplace=True) # axis = 0 => satiri komple siler

In [6]:
#dt.gender = [1 if each == "female" else 0 for each in dt.gender]

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
dt.gender = le.fit_transform(dt.gender)

In [8]:
print("""
Distribution of {}

1 -> female
2 -> male
0 -> brand
3 -> unknown
""".format(dt.gender.value_counts()))


Distribution of gender
1    5725
2    5469
0    4328
3     702
Name: count, dtype: int64

1 -> female
2 -> male
0 -> brand
3 -> unknown



# Data Clean

In [10]:
# Regular Expression: includes a kind of search pattern algorithms
import re

## Stopwords 
# some words does not necessary to classify our text, such as the, to, as, and.
# Because they do not show or include anything about owner
import nltk #natural language tool kit
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

#Lemmatizer
import nltk as nlp
nltk.download('wordnet')

lemma = nlp.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oguzk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oguzk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:

xx = []
for i in dt.description:
    editted = re.sub("[^a-zA-Z]", " ", i)
    editted_ = editted.lower()
    all_words = nltk.wordpunct_tokenize(editted_) # its better than split() method.
    new_words = [lemma.lemmatize(each) for each in all_words if not each in set(stopwords.words("english"))]
    
    xx.append(" ".join(new_words))




In [13]:
dt.drop(["description"], axis=1, inplace=True)
dt["description"] = xx

dt.head()



Unnamed: 0,gender,description
0,2,sing rhythm
1,2,author novel filled family drama romance
2,2,louis whining squealing
3,2,mobile guy er shazam google kleiner perkins ya...
4,1,ricky wilson best frontman kaiser chief best b...


## Bag of words

It find a summarization of each unique words. 

In [87]:
from sklearn.feature_extraction.text import CountVectorizer #bag of words



count_vectorizers = CountVectorizer(max_df=500, stop_words="english")# it finds most 500 common words

sparce_matrix = count_vectorizers.fit_transform(xx).toarray()



In [88]:
print("The most common 500 words: {}".format(count_vectorizers.get_feature_names_out()))

The most common 500 words: ['aa' 'aaa' 'aacc' ... 'zy' 'zyyfromyigo' 'zz']


# Train - Test Split

In [90]:
x = np.argmax(sparce_matrix, axis=1).reshape(-1,1)
y = dt.gender.values

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=31)


# Create the model

## Naive Bayes

In [92]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(x_train, y_train)


## Prediction

In [94]:
y_head = nb.predict(x_test).reshape(-1,1)

## Accuracy

In [96]:
y_head

array([[2],
       [2],
       [2],
       ...,
       [1],
       [1],
       [2]])

In [97]:
print("Accuracy of our model: ",nb.score(y_head, y_test))

Accuracy of our model:  0.32409118915588414


- do not forget. in this problem we have 4 genders. male, female, brand and unknown