# Welcome Basic NLP Tutorial

## Application : Gender Classification by Description

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#load library
import nltk as nlp
import pandas as pd
import numpy as np

In [3]:
#import data
dataset = pd.read_csv(r"gender-classifier.csv", encoding="latin1")
dataset.head(1)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,created,...,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,12/5/13 1:48,...,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.5873e+17,main; @Kan1shk3,Chennai


In [4]:
dataset.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'gender', 'gender:confidence', 'profile_yn',
       'profile_yn:confidence', 'created', 'description', 'fav_number',
       'gender_gold', 'link_color', 'name', 'profile_yn_gold', 'profileimage',
       'retweet_count', 'sidebar_color', 'text', 'tweet_coord', 'tweet_count',
       'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone'],
      dtype='object')

In [5]:
#get gender and description
dataset = dataset [["gender","description"]]
dataset.head()

Unnamed: 0,gender,description
0,male,i sing my own rhythm.
1,male,I'm the author of novels filled with family dr...
2,male,louis whining and squealing and all
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


In [6]:
#check and clean nan values
print("Nan values count:",dataset.description.isna().sum())
dataset.dropna(axis=0,inplace=True)
print("Cleaned")

Nan values count: 3744
Cleaned


#### TODO
* Transform Gender Feature 
* Select a sample for example
* Clean data from smile,point vs..
* Word tokenize
* Clean data from stopwords
* Get words root (lemmatization)
* Join words
* All process implement all description
* Bag Of Words
* Train-Test Split
* Search Random Forest's Best Pramaeters
* Classification
* Result

In [7]:
#tranform gender from str to int
dataset.gender = [1 if i=="male" else 0 for i in dataset.gender]
dataset.head(1)

Unnamed: 0,gender,description
0,1,i sing my own rhythm.


In [8]:
#Select a sample for example
sample = dataset.loc[6,:]
print("Description:",sample.description,"word count:",len(sample.description.split()))

Description: A global marketplace for images, videos and music. Sharing photos, inspiration, design tips & videos for the creative community. word count: 19


In [9]:
#Clean data from smile,point vs..
import re #for regular expression
sample.description = re.sub("[^a-zA-z]", " ",sample.description) 
print("Description: ",sample.description )

Description:  A global marketplace for images  videos and music  Sharing photos  inspiration  design tips   videos for the creative community 


In [10]:
#import nltk
#nltk.download('punkt')

#Word tokenize
sample.description = nlp.word_tokenize(sample.description) #split words: He didn't homework : "he","did","not","homework"
sample.description

['A',
 'global',
 'marketplace',
 'for',
 'images',
 'videos',
 'and',
 'music',
 'Sharing',
 'photos',
 'inspiration',
 'design',
 'tips',
 'videos',
 'for',
 'the',
 'creative',
 'community']

In [11]:
#nltk.download('stopwords')

#Clean data from stopwords
from nltk.corpus import stopwords
sample.description = [i for i in sample.description if not i in set(stopwords.words("english"))]
sample.description

['A',
 'global',
 'marketplace',
 'images',
 'videos',
 'music',
 'Sharing',
 'photos',
 'inspiration',
 'design',
 'tips',
 'videos',
 'creative',
 'community']

In [12]:
#nltk.download('wordnet')

#Get words root (lemmatization) example: videos -> video, tips -> tip

lemma = nlp.WordNetLemmatizer()
sample.description = [lemma.lemmatize(i) for  i in sample.description]
sample.description

['A',
 'global',
 'marketplace',
 'image',
 'video',
 'music',
 'Sharing',
 'photo',
 'inspiration',
 'design',
 'tip',
 'video',
 'creative',
 'community']

In [13]:
sample.description = " ".join(sample.description)
sample.description

'A global marketplace image video music Sharing photo inspiration design tip video creative community'

In [14]:
#All process implement all description

def preprocess(x):
    x = str(x)
    x = re.sub("[^a-zA-z]", " ",x)
    x = nlp.word_tokenize(x)
    #x = [i for i in x if not i in set(stopwords.words("english"))] #slowly
    x = [lemma.lemmatize(i) for  i in x]
    x = " ".join(x)
    return x

dataset.description = dataset.description.apply(preprocess)
dataset.description[0:10]

0                                 i sing my own rhythm
1    I m the author of novel filled with family dra...
2                  louis whining and squealing and all
3    Mobile guy er Shazam Google Kleiner Perkins Ya...
4    Ricky Wilson The Best FRONTMAN Kaiser Chiefs T...
5                                    you don t know me
6    A global marketplace for image video and music...
7       The secret of getting ahead is getting started
8                 Pll Fan Crazy about MCD Ramen is bae
9    Renaissance art historian University of Nottin...
Name: description, dtype: object

In [15]:
#Bag Of Words
from sklearn.feature_extraction.text import CountVectorizer

max_features = 1000

vectorizer = CountVectorizer(max_features=max_features, stop_words = "english")
sparce_matrix = vectorizer.fit_transform(dataset.description.values.astype('U')).toarray()

print("En sık kullanılan kelimeler:",vectorizer.get_feature_names())

En sık kullanılan kelimeler: ['__', '___', '_n', 'account', 'action', 'activist', 'actor', 'actress', 'add', 'addict', 'addicted', 'adult', 'adventure', 'advertising', 'advice', 'advocate', 'affiliate', 'aficionado', 'african', 'age', 'agency', 'ain', 'air', 'aka', 'alive', 'alternative', 'alum', 'amateur', 'amazing', 'ambassador', 'america', 'american', 'analysis', 'analyst', 'android', 'angel', 'animal', 'anime', 'answer', 'anti', 'app', 'apple', 'area', 'arsenal', 'art', 'artist', 'ask', 'aspiring', 'assistant', 'association', 'atheist', 'athlete', 'athletics', 'author', 'available', 'average', 'avid', 'award', 'away', 'awesome', 'baby', 'bad', 'bae', 'ball', 'band', 'bar', 'baseball', 'based', 'basketball', 'beach', 'bear', 'beat', 'beautiful', 'beauty', 'beer', 'believe', 'believer', 'best', 'better', 'bi', 'bieber', 'big', 'biggest', 'bio', 'bit', 'bitch', 'black', 'blacklivesmatter', 'blessed', 'blind', 'blog', 'blogger', 'blue', 'board', 'body', 'book', 'booking', 'bookings', '

In [16]:
#Train-Test Split
from sklearn.model_selection import train_test_split
X = sparce_matrix
y = dataset.gender
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=42)
print("Train :",X_train.shape)
print("Test  :",X_test.shape)

Train : (14601, 1000)
Test  : (1623, 1000)


In [17]:
#Search Random Forest's Best Pramaeters

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
"""
from sklearn.model_selection import GridSearchCV
param_grid = [{'n_estimators':[10,50,100],
               'max_depth': np.arange(1, 10),
               'min_samples_leaf': [1, 5, 20, 50, 100],
               'min_weight_fraction_leaf': [0.0,0.1,0.3],
               'random_state':[1,4,7]}]
gridCV = GridSearchCV(estimator=rf, param_grid=param_grid,cv=10)
gridCV = gridCV.fit(X_train, y_train)

n_est = gridCV.best_params_["n_estimators"]
md = gridCV.best_params_["max_depth"]
msl = gridCV.best_params_["min_samples_leaf"]
mwfl = gridCV.best_params_["min_weight_fraction_leaf"]
rs = gridCV.best_params_["random_state"]
print(gridCV.best_score_)
print(gridCV.best_params_)
"""

'\nfrom sklearn.model_selection import GridSearchCV\nparam_grid = [{\'n_estimators\':[10,50,100],\n               \'max_depth\': np.arange(1, 10),\n               \'min_samples_leaf\': [1, 5, 20, 50, 100],\n               \'min_weight_fraction_leaf\': [0.0,0.1,0.3],\n               \'random_state\':[1,4,7]}]\ngridCV = GridSearchCV(estimator=rf, param_grid=param_grid,cv=10)\ngridCV = gridCV.fit(X_train, y_train)\n\nn_est = gridCV.best_params_["n_estimators"]\nmd = gridCV.best_params_["max_depth"]\nmsl = gridCV.best_params_["min_samples_leaf"]\nmwfl = gridCV.best_params_["min_weight_fraction_leaf"]\nrs = gridCV.best_params_["random_state"]\nprint(gridCV.best_score_)\nprint(gridCV.best_params_)\n'

In [18]:
#Classification
clf = RandomForestClassifier()#n_estimators=n_est, max_depth=md, min_samples_leaf=msl, min_weight_fraction_leaf=mwfl,random_state=rs
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [19]:
#Result
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)

print("Accuracy :",acc)
print("F1 Score :",f1)

Accuracy : 0.67775723968
F1 Score : 0.422099447514


# THE END