In [1]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Seed NumPy's global random number generator so that any step drawing from it
# is repeatable across runs.
# NOTE(review): the classifier below is given its own random_state, so this seed
# presumably only matters for calls that fall back to the global generator
# (e.g. DataFrame.sample without random_state) -- confirm.
np.random.seed(42)

In [2]:
# Load the labelled training data.
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv with index_col=0 is the documented replacement. As before, the
# first CSV column (the word) becomes the index. Date parsing of the index
# (which from_csv attempted by default) is not requested: the index holds
# words, not timestamps.
train = pd.read_csv('train.csv', index_col=0)

# Expose the word as a regular column so it can be fed to the vectorizer ...
train['word'] = train.index

# ... and swap the word index for a plain 0..n-1 integer index. reset_index
# returns a new frame instead of mutating the index in place.
train = train.reset_index(drop=True)

train.head()

Unnamed: 0,Label,word
0,1,Аалтонен
1,0,Аар
2,0,Аарон
3,0,ААРОН
4,0,Аарона


In [4]:
# Turn every word into TF-IDF weights over its character n-grams of length 2-4.
vectorizer = TfidfVectorizer(
    ngram_range=(2, 4),
    analyzer='char',
)

# Learn the n-gram vocabulary from the training words and encode them in one go.
# Despite the name, `train_counts` holds TF-IDF weights rather than raw counts.
train_counts = vectorizer.fit_transform(train['word'])

# (n_words, n_distinct_ngrams)
train_counts.shape

(101408, 54442)

In [5]:
# Random forest trained on the TF-IDF matrix.
#   n_jobs=4        -> request 4 parallel jobs
#   random_state=0  -> fix the forest's own randomness
# The number of trees is left at the library default (the recorded output for
# this run shows n_estimators=10).
# NOTE(review): that default differs between scikit-learn versions -- pin
# n_estimators explicitly if results must be comparable across installs.
clf = RandomForestClassifier(n_jobs=4, random_state=0)

# fit() returns the estimator itself, so its repr is shown as the cell output.
clf.fit(X=train_counts, y=train['Label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [6]:
# Load the unlabelled test data.
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv with index_col=0 is the documented replacement. As before, the
# first CSV column (the word) becomes the index. Date parsing of the index
# (which from_csv attempted by default) is not requested: the index holds
# words, not timestamps.
test = pd.read_csv('test.csv', index_col=0)

# Expose the word as a regular column so it can be fed to the vectorizer ...
test['word'] = test.index

# ... and swap the word index for a plain 0..n-1 integer index. reset_index
# returns a new frame instead of mutating the index in place.
test = test.reset_index(drop=True)

test.head()

Unnamed: 0,word
0,Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


In [7]:
# Display the (rows, columns) shape of the test frame as a quick sanity check.
test.shape

(188920, 1)

In [9]:
# Encode the test words with the vocabulary already learned from the training
# words (transform only, no refit), so the columns match what `clf` was fit on.
# NOTE(review): despite its name, `y` holds the test feature matrix, not labels.
y = vectorizer.transform(test['word'])
# Display the shape; the column count should equal that of the training matrix.
y.shape

(188920, 54442)

In [10]:
# Predict a label for every encoded test word.
res = clf.predict(y)
# Display the shape of the prediction array (the recorded output is 1-D).
res.shape

(188920,)

In [11]:
# Attach the predictions to the test frame (this mutates `test` in place).
test['Label'] = res
# Use the frame's integer row index as the submission id.
test['Id'] = test.index

In [12]:
# Eyeball a random dozen of the test words that were predicted as class 1.
# No random_state is passed to sample().
# NOTE(review): the draw presumably comes from NumPy's global generator, so it
# depends on the seed set at the top of the notebook and on cell run order.
test.loc[test['Label'].eq(1)].sample(n=12)

Unnamed: 0,word,Label,Id
170565,Федорову,1,170565
143278,Сейди,1,143278
34610,Грунин,1,34610
35329,ГЭВИНУ,1,35329
103469,Остина,1,103469
144280,Сергеева,1,144280
54753,Инрайту,1,54753
177363,ХЬЮБЕРТ,1,177363
165168,Уайлсу,1,165168
177475,Хэнд,1,177475


In [13]:
# Write the submission file: one row per test word, without the frame index.
# The 'Label' column is written under the header name 'Prediction'.
test[['Id', 'Label']].to_csv(
    'tfidf_submission.csv',
    header=['Id', 'Prediction'],
    index=False,
)