In [0]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack, csr_matrix, vstack
import pickle as pickle

data = pd.read_csv('/content/drive/My Drive/gender_data.csv', engine='python' , encoding='utf-8', error_bad_lines=False)
#data.drop('Unnamed: 0',axis=1,inplace=True)
data.Gender = data.Gender.map({'Women':1,'Men':0})
data.head()







Unnamed: 0,text,Gender
0,Her şeyin doğallığı güzeldir.\nUsta olan doğad...,0
1,psikolojim bozuk,1
2,Toprak ananın şifalı mucizeleriyle tanışın: 8 ...,1
3,Bu hatlar neden çekmiyor öleceksek söyleyin şi...,1
4,Rüzgar eserken hindi sesi gibi bir ses geliyor...,0


In [0]:
df = data.copy()
df['text'] = df['text'].astype(str).fillna(' ')
# Lower case comment
df['text'] = df['text'].str.lower()
# Add num words of comment as feature
df['num_words'] = df['text'].apply(lambda s: len(s.split()))
# Add num words unique of comment as feature
df['num_unique_words'] = df['text'].apply(lambda s: len(set(w for w in s.split())))
# Add num words unique per num words of comment as feature
df['words_vs_unique'] = df['num_unique_words'] / df['num_words'] * 100


In [0]:
from sklearn.model_selection import train_test_split

train_df, test_df= train_test_split(df, test_size=0.25, random_state=39, stratify=df.Gender)
y_train = train_df['Gender'].values


In [0]:
train_df

Unnamed: 0,text,Gender,num_words,num_unique_words,words_vs_unique
2521095,"ayıp ettinnn aaaa, bizim karnabahar tarlasını...",0,11,10,90.909091
2363237,suyun bulandığı yerden geliyoruz. suyun gözesi...,0,13,12,92.307692
2228785,kısmet olursa hepimiz uçacağız birgün. 20 yıl ...,0,16,16,100.000000
1855674,her yeri bayır baca olan bir yerde(şırnak) yap...,1,15,15,100.000000
1758361,tugba bosverizm sen ben grupta konusurken beni...,1,15,15,100.000000
...,...,...,...,...,...
2697301,hahaha bilgili mi :) daha alfabeyi bilmiyor. o...,0,18,18,100.000000
1666075,yemek yemeye daima vakit bulurum https://twitt...,1,7,7,100.000000
2222781,haber başlığı böyle atılırsa alay konusu olmak...,1,12,12,100.000000
2426757,nasıl yakalamış!! :))pic.twitter.com/ccw149aykl,0,3,3,100.000000


In [0]:
# Just keep statistic feature to process by model
EXCLUED_COLS = ['text','Gender']
static_cols = [c for c in train_df.columns if not c in EXCLUED_COLS]
print(static_cols)
X_train_static = train_df[static_cols].values
X_test_static = test_df[static_cols].values
print(X_train_static.shape, X_test_static.shape)

['num_words', 'num_unique_words', 'words_vs_unique']
(2676762, 3) (892254, 3)


In [0]:
tfidf = TfidfVectorizer(
    min_df = 5, 
    max_df = 0.8, 
    max_features=10000,
    sublinear_tf=True
)

In [0]:
train_comments = train_df['text'].values
test_comments = test_df['text'].values
X_train_tfidf = tfidf.fit_transform(train_comments)
X_test_tfidf = tfidf.transform(test_comments)

In [0]:
with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(tfidf, fin)

In [0]:
X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()

In [0]:
X_train_split, X_valid, y_train_split, y_valid = train_test_split(X_train, y_train, test_size=0.1)

In [0]:
param = {'num_leaves':100, 'num_trees':300, 'objective':'binary', "max_bin":255, "learning_rate":0.1}

In [0]:
train_data = lgb.Dataset(X_train_split, y_train_split)
valid_data = lgb.Dataset(X_valid, y_valid)

In [0]:
bst = lgb.train(param, train_data, num_boost_round=500, valid_sets=[valid_data])
print("accuracy: {}".format(accuracy_score(y_valid, 1*(bst.predict(X_valid)>0.5))))



[1]	valid_0's binary_logloss: 0.661435
[2]	valid_0's binary_logloss: 0.651003
[3]	valid_0's binary_logloss: 0.642308
[4]	valid_0's binary_logloss: 0.634921
[5]	valid_0's binary_logloss: 0.628765
[6]	valid_0's binary_logloss: 0.623626
[7]	valid_0's binary_logloss: 0.619069
[8]	valid_0's binary_logloss: 0.615254
[9]	valid_0's binary_logloss: 0.611965
[10]	valid_0's binary_logloss: 0.609077
[11]	valid_0's binary_logloss: 0.606561
[12]	valid_0's binary_logloss: 0.604271
[13]	valid_0's binary_logloss: 0.602364
[14]	valid_0's binary_logloss: 0.600727
[15]	valid_0's binary_logloss: 0.599167
[16]	valid_0's binary_logloss: 0.59753
[17]	valid_0's binary_logloss: 0.59635
[18]	valid_0's binary_logloss: 0.59518
[19]	valid_0's binary_logloss: 0.593889
[20]	valid_0's binary_logloss: 0.592719
[21]	valid_0's binary_logloss: 0.591631
[22]	valid_0's binary_logloss: 0.590538
[23]	valid_0's binary_logloss: 0.58948
[24]	valid_0's binary_logloss: 0.588712
[25]	valid_0's binary_logloss: 0.58776
[26]	valid_0's

In [0]:
new_pred = ["adamı hasta etmeyin oğlum"]
new_pred = tfidf.transform(new_pred)
bst.predict(new_pred)

array([0.27503364])

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
