In [1]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
data = pd.read_csv('gender-classifier-DFE-791531.csv',encoding='latin-1')  
print(data.shape)
data = pd.concat([data.gender, data.text,data.description], axis=1)
data.head()

(20050, 26)


Unnamed: 0,gender,text,description
0,male,Robbie E Responds To Critics After Win Against...,i sing my own rhythm.
1,male,ÛÏIt felt like they were my friends and I was...,I'm the author of novels filled with family dr...
2,male,i absolutely adore when louis starts the songs...,louis whining and squealing and all
3,male,Hi @JordanSpieth - Looking at the url - do you...,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,female,Watching Neighbours on Sky+ catching up with t...,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


In [3]:
data['description'].fillna(" ",inplace=True)
data.dropna(axis = 0, inplace = True) 
data['gender']=data[(data['gender'] == "female") | (data['gender'] == "male")] 
data['gender'].value_counts()

female    6700
male      6194
Name: gender, dtype: int64

In [4]:
data.gender = [1 if gender == "female" else 0 for gender in data.gender]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19953 entries, 0 to 20049
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   gender       19953 non-null  int64 
 1   text         19953 non-null  object
 2   description  19953 non-null  object
dtypes: int64(1), object(2)
memory usage: 623.5+ KB


In [5]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download("stopwords")
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def process(word):
  word = word.lower()
  word = re.sub(r"http\S+|www\S+|https\S+"," ",word,flags=re.MULTILINE)
  word = word.translate(str.maketrans("","",string.punctuation))
  word = re.sub(r'\@\w+|\#',"",word)
  tokens = word_tokenize(word)
  filtered_words = [word for word in tokens if word not in stop_words]
  ps = PorterStemmer()
  stemmed_words = [ps.stem(word) for word in filtered_words]
  lemmatizer = WordNetLemmatizer()
  lemmatized_words =[lemmatizer.lemmatize(word,pos='a') for word in stemmed_words]
  return " ".join(lemmatized_words)

In [7]:
text_list = []                            
for text in data.text:
  text = process(text)
  text_list.append(text)
description_list = []
for description in data.description:
  description = process(description)
  description_list.append(description) 

In [8]:
max_features = 5000               
cv = CountVectorizer(max_features=max_features)

t = cv.fit_transform(text_list).toarray()
d = cv.fit_transform(description_list).toarray()

In [9]:
text = pd.DataFrame(t)
description = pd.DataFrame(d)
X = pd.concat([description,text],join='outer',axis=1)
x = np.array(X)
y = np.array(data['gender']) 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 45)

In [10]:
lr = LogisticRegression(max_iter = 500)
lr.fit(x_train,y_train)

y_pred = lr.predict(x_test)  

In [12]:
accuracy = 100.0 * accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  73.79759519038076
