In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from pandas.tools.plotting import table

import warnings
warnings.simplefilter(action='ignore')
import seaborn as sns
import pandas as pd

%matplotlib inline

In [2]:
df = pd.read_csv('./datasets/total.csv')

In [3]:
df.head()

Unnamed: 0,subreddit,text
0,r/AskWomen,What’s something you had to learn the ‘hard wa...
1,r/AskWomen,Have you ever discovered that your were the to...
2,r/AskWomen,How do you reconnect with friends after you've...
3,r/AskWomen,Screw all these relationship questions/questio...
4,r/AskWomen,Ladies who thought they’d never find another S...


In [4]:
# Creating target group with 1 representing from r/science
df['is_men'] = df['subreddit'].map(lambda x: 1 if x=='r/AskMen' else 0)

In [5]:
df.head()

Unnamed: 0,subreddit,text,is_men
0,r/AskWomen,What’s something you had to learn the ‘hard wa...,0
1,r/AskWomen,Have you ever discovered that your were the to...,0
2,r/AskWomen,How do you reconnect with friends after you've...,0
3,r/AskWomen,Screw all these relationship questions/questio...,0
4,r/AskWomen,Ladies who thought they’d never find another S...,0


In [6]:
# Defining feature and target variables
X = df['text']
y = df['is_men']

In [7]:
# Splitting into train/test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                   stratify=y)

In [8]:
y_train.mean()

0.5016857720836143

In [9]:
pipe = Pipeline([
    ('cv', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegressionCV(solver='liblinear'))
])
# pipe.fit(X_train, y_train)
# pipe.score(X_train, y_train), pipe.score(X_test, y_test)

In [10]:
pipe_params = {
    'cv__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
    'cv__max_features': [20, 30, 50, 100, 300, 500, 1000],
    'lr__penalty': ['l1', 'l2']
}
gs = GridSearchCV(pipe, param_grid=pipe_params,
                 cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_





0.6254214430209035


{'cv__max_features': 500, 'cv__ngram_range': (1, 2), 'lr__penalty': 'l2'}

In [11]:
cv_features = gs.best_estimator_.named_steps['cv']

In [12]:
pipe1 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegressionCV(solver='liblinear'))
])

In [13]:
pipe1_params = {
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidf__max_features': [2000, 5000, 10000, 20000],
    'lr__penalty': ['l2']
}
gs1 = GridSearchCV(pipe1, param_grid=pipe1_params,
                  cv=3)
gs1.fit(X_train, y_train)
print(gs1.best_score_)
gs1.best_params_

0.643627781523938


{'lr__penalty': 'l2',
 'tfidf__max_features': 20000,
 'tfidf__ngram_range': (1, 2)}

In [14]:
pipe2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('rf', RandomForestClassifier())
])

In [15]:
pipe2_params = {
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidf__max_features': [20, 30, 50, 100, 300, 500, 1000, 2000],
    'rf__n_estimators': [10, 20, 50, 100],
    'rf__max_depth': [None, 2, 3, 4],
    'rf__max_features': ['auto', 1.0]
}

gs2 = GridSearchCV(pipe2, param_grid=pipe2_params,
                  cv=3)
gs2.fit(X_train, y_train)
print(gs2.best_score_)
gs2.best_params_

0.6287929871881321


{'rf__max_depth': 3,
 'rf__max_features': 'auto',
 'rf__n_estimators': 100,
 'tfidf__max_features': 1000,
 'tfidf__ngram_range': (1, 2)}

In [16]:
pipe3 = Pipeline([
    ('cv', CountVectorizer(stop_words='english')),
    ('rf', RandomForestClassifier())
])

In [17]:
pipe3_params = {
    'cv__ngram_range': [(1,1), (1,2), (1,3)],
    'cv__max_features': [20, 30, 50, 100, 300, 500, 1000, 2000],
    'rf__n_estimators': [10, 20, 50, 100],
    'rf__max_depth': [None, 2, 3, 4],
    'rf__max_features': ['auto', 1.0]
}

gs3 = GridSearchCV(pipe3, param_grid=pipe3_params,
                  cv=3)
gs3.fit(X_train, y_train)
print(gs3.best_score_)
gs3.best_params_

0.6217127444369521


{'cv__max_features': 2000,
 'cv__ngram_range': (1, 2),
 'rf__max_depth': None,
 'rf__max_features': 1.0,
 'rf__n_estimators': 100}

In [18]:
coefs = gs1.best_estimator_.named_steps['lr']
features = gs1.best_estimator_.named_steps['tfidf']
coef_df = pd.DataFrame({
    'text': features.get_feature_names(),
    'values': coefs.coef_[0]
})

In [19]:
coef_df.head()

Unnamed: 0,text,values
0,000,0.178422
1,10,-0.199523
2,10 15,0.015318
3,10 hours,-0.017722
4,10 minutes,-0.070219


In [20]:
coef_df.sort_values('values', ascending=False).head(20)


Unnamed: 0,text,values
4234,men,2.689344
2084,girl,1.605428
2283,guys,1.390469
2092,girlfriend,1.13996
18599,wife,1.08745
3516,man,1.050359
4339,men reddit,1.009452
2096,girls,0.826755
2272,guy,0.806678
334,attractive,0.710188


In [21]:
coef_df.sort_values('values', ascending=True).head(20)


Unnamed: 0,text,values
2904,ladies,-1.355179
18851,women,-1.33914
1793,favorite,-0.913469
1272,did,-0.713952
18970,women reddit,-0.686152
19121,work,-0.660386
7512,people,-0.655997
11960,self,-0.565912
1174,day,-0.557315
18182,wear,-0.519011
