In [3]:
import pickle
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample

class DenseTransformer(MinMaxScaler):
    """Pipeline step that turns a sparse feature matrix into a dense array.

    The class derives from ``MinMaxScaler`` only to pick up the scikit-learn
    estimator plumbing (``fit_transform``, ``get_params``, ...). Both ``fit``
    and ``transform`` are overridden, so no min/max scaling is ever applied.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self`` (the transformer is stateless).

        Parameters
        ----------
        X : ignored
        y : ignored
        **fit_params : ignored

        Returns
        -------
        DenseTransformer
            This instance, so calls can be chained.
        """
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Parameters
        ----------
        X : sparse matrix or array-like
            Features to densify. Anything exposing ``toarray()`` is expanded
            with it; any other input is passed through ``np.asarray``.
        y : ignored
        **fit_params : ignored

        Returns
        -------
        numpy.ndarray
            Dense copy/view of ``X``.
        """
        # toarray() yields a regular ndarray, whereas todense() yields a
        # numpy.matrix, which newer scikit-learn releases refuse as input.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already-dense input used to crash on the missing .todense attribute.
        return np.asarray(X)

  from numpy.core.umath_tests import inner1d


In [4]:
# Read the labelled CSV and replace every missing cell with a single space.
df = pd.read_csv('text_store_label.csv').fillna(" ")

# Expose each column of interest as a NumPy array for the later cells.
y = df['label'].values
X = df['text'].values
l = df['link'].values
u_class = df['store'].values

In [5]:
df['label'].value_counts()

0    2073
1     545
Name: label, dtype: int64

In [9]:
# Split the frame by label: the counts above show 0 is the larger class.
df_majority = df.loc[df['label'] == 0]
df_minority = df.loc[df['label'] == 1]

# Draw minority rows with replacement until both classes have the same size;
# the fixed seed keeps the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=123,
)

# Stack the untouched majority rows on top of the resampled minority rows.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled['label'].value_counts()

1    2073
0    2073
Name: label, dtype: int64

In [38]:
# Rebind the working arrays so they now come from the balanced frame.
u_class, l, X, y = (
    df_upsampled[column].values
    for column in ('store', 'link', 'text', 'label')
)

In [40]:
# Token counts -> dense array -> random forest.
# NOTE: the first step is registered as 'tfidf' but holds a CountVectorizer
# (raw counts, no TF-IDF weighting); the name is kept so the pickled
# pipeline's step names stay the same.
text_clf = Pipeline([
    ('tfidf', CountVectorizer()),
    ('tranf', DenseTransformer()),
    ('clf', RandomForestClassifier(n_estimators=200, n_jobs=3)),
])

# The model is trained on the link column (l), not the text column (X).
text_clf.fit(l, y)

# Persist the fitted pipeline; the context manager guarantees the file is
# flushed and closed even if pickling raises.
filename = 'link_clf3.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)