In [211]:
# import external libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [249]:
%run scripts/features.py
%run scripts/eval.py

In [237]:
# load training and test data set
training = pd.read_table('./data/BingHackathonTrainingData.txt', sep='\t', header=None)
test = pd.read_table('./data/BingHackathonTestData.txt', sep='\t', header=None)

training = training[range(0,  6)]
test = test[range(0, 6)]

In [238]:
training = training.rename(columns={0: 'record_id', 1: 'topic_id', 2: 'publication_year', 3: 'authors', 4: 'title', 5: 'summary'})
test = test.rename(columns={0: 'record_id', 1: 'topic_id', 2: 'publication_year', 3: 'authors', 4: 'title', 5: 'summary'})

In [239]:
y = training.topic_id

In [240]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import train_test_split

In [241]:
X_train, X_test, y_train, y_test = train_test_split(training, y, test_size=0.2, random_state=44)

In [242]:
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

(3598, 6) (900, 6) (3598L,) (900L,)


In [243]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_selection import chi2, SelectKBest

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler

import xgboost as xgb

from sklearn.pipeline import Pipeline

In [244]:
ft = FeatureTransformer()
logistic_regression = LogisticRegression(C=5, penalty='l2')

pipeline_log_reg = Pipeline([('ft', ft), ('log_reg', logistic_regression)])

In [233]:
ft = FeatureTransformer()
scaler = StandardScaler()
svc = SVC(kernel='linear', C=3.)

pipeline_svc = Pipeline([('ft', ft), ('scaler', scaler), ('svc', svc)])

In [253]:
ft = FeatureTransformer()
extreme_gradient_boosting = xgb.XGBClassifier(n_estimators=750, learning_rate=0.05, max_depth=4, min_child_weight=2)
pipeline_xgb = Pipeline([('ft', ft),('xgb', extreme_gradient_boosting)])

In [250]:
# fit logistic regression model

pipeline_log_reg.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer()), ('log_reg', LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

## Cross-validation

In [40]:
scores = eval_model([pipeline_log_reg], X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


accuracy score: 0.691667
combined score: 0.691667
accuracy score: 0.698611
combined score: 0.698611
accuracy score: 0.679167
combined score: 0.679167
[0.69166666666666665, 0.69861111111111107, 0.6791666666666667]


In [234]:
# fit linear svc model

pipeline_svc.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer()), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [254]:
# fit extreme gradient boosting model

pipeline_xgb.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer()), ('xgb', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=2, missing=None, n_estimators=750, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [257]:
# predictions

# predsTrain_NB = pipeline_naive_bayes.predict(X_train.toarray())
predsTrain_LR = pipeline_log_reg.predict(X_train)
# predsTrain_RF = pipeline_rf.predict(X_train)
# predsTrain_SVC = pipeline_svc.predict(X_train)
# predsTrain_XGB = pipeline_xgb.predict(X_train)


# predsTest_NB = pipeline_naive_bayes.predict(X_test.toarray())
predsTest_LR = pipeline_log_reg.predict(X_test)
# predsTest_SVC = pipeline_svc.predict(X_test)
# predsTest_RF = pipeline_rf.predict(X_test)
# predsTest_XGB = pipeline_xgb.predict(X_test)

## Performance of ensemble on cross validation

In [140]:
scores = eval_model([pipeline_log_reg, pipeline_xgb], X_train, y_train)

accuracy score: 0.691667
accuracy score: 0.694444
combined score: 0.686111
accuracy score: 0.679167
accuracy score: 0.675000
combined score: 0.679167
accuracy score: 0.658333
accuracy score: 0.652778
combined score: 0.661111
[0.68611111111111112, 0.6791666666666667, 0.66111111111111109]


In [258]:
# print 'Accuracy on the training examples for Naive Bayes %f ' %(accuracy_score(y_train, predsTrain_NB))
# print 'Accuracy on the test examples for Naive Bayes %f ' %(accuracy_score(y_test, predsTest_NB))

# print '\n\n'
print 'Accuracy on the training examples for Logistic Regression %f ' %(accuracy_score(y_train, predsTrain_LR))
print 'Accuracy on the test examples for Logistic Regression %f ' %(accuracy_score(y_test, predsTest_LR))

# print 'Accuracy on the training examples for Linear SVC %f ' %(accuracy_score(y_train, predsTrain_SVC))
# print 'Accuracy on the test examples for Linear SVC %f ' %(accuracy_score(y_test, predsTest_SVC))

# print '\n\n'
# print 'Accuracy on the training examples for Random Forest Classifier %f ' %(accuracy_score(y_train, predsTrain_RF))
# print 'Accuracy on the test examples for Random Forest Classifier %f ' %(accuracy_score(y_test, predsTest_RF))

# print 'Accuracy on the training examples for XGBoost %f ' %(accuracy_score(y_train, predsTrain_XGB))
# print 'Accuracy on the test examples for XGBoost %f ' %(accuracy_score(y_test, predsTest_XGB))

Accuracy on the training examples for Logistic Regression 0.737910 
Accuracy on the test examples for Logistic Regression 0.727778 


In [14]:
print 'Classification report for training examples \n', classification_report(y_train, predsTrain_LR)
print 'Classification report for test examples \n', classification_report(y_test, predsTest_LR)

Classification report for training examples 
             precision    recall  f1-score   support

          0       0.79      0.89      0.84      1579
          1       0.67      0.79      0.73      1359
          2       0.61      0.21      0.32       660

avg / total       0.71      0.73      0.70      3598

Classification report for test examples 
             precision    recall  f1-score   support

          0       0.84      0.84      0.84       399
          1       0.64      0.83      0.73       360
          2       0.34      0.09      0.14       141

avg / total       0.68      0.72      0.68       900



In [106]:
confusion_matrix(y_train, predsTrain_LR)

array([[1406,  145,   28],
       [ 228, 1071,   60],
       [ 149,  371,  140]])

## Submission

In [259]:
# train on full training set
pipeline_log_reg.fit(training, y)

Pipeline(steps=[('ft', FeatureTransformer()), ('log_reg', LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

In [260]:
# predictions
preds = pipeline_log_reg.predict(test)

In [261]:
submission_df = pd.DataFrame({'record_id': test.record_id.values, 'topic_id': preds})

In [262]:
submission_df.to_csv('submissions/third_classification.csv', sep='\t', index=False)

In [100]:
def all_authors(authors_text):
    authors = authors_text.split(';')
    return authors

authors_train = training.authors.map(all_authors)
authors_test = training.authors.map(all_authors)

In [102]:
authors_train_flat = [i for row in authors_train for i in row]
authors_test_flat = [i for row in authors_test for i in row]

In [107]:
authors_whole = authors_train_flat
authors_whole.extend(authors_test_flat)

In [110]:
training.authors.head()

0                          10723 10724;7648 8755 10725
1                              13677 13678;13679 13680
2    9809 5003 9120;1483 1319;2803 6006;6009 6010;7...
3                   8609 23978;18555 23979;22257 22258
4            10452 9243;5228 10681;1441 9975;5687 5688
Name: authors, dtype: object

In [98]:
set(all_authors_list1) - set(all_authors_list)

{'20581 2339',
 '2 99 24832',
 '13728 3 13729',
 '904 6092 1440',
 '2714 3 11318',
 '10434 23470',
 '20923 20924',
 '15309 15310',
 '18971 25624 2429 18972',
 '5595 19765',
 '10833 20424',
 '1812 22554',
 '1638 20952',
 '11264 22760 22761',
 '7202 9120',
 '26907 6615',
 '26456 482 3579',
 '28304 623',
 '2605 26734',
 '3825 23482',
 '6400 871 6401',
 '10297 2371',
 '3577 1242 5069',
 '97 10923 3448 23891',
 '755 25428',
 '3349 22926',
 '18404 11904',
 '15802 15803',
 '1267 482 21928',
 '9809 14735 6079',
 '622 5145',
 '755 70 14247',
 '19790 627',
 '3624 3140',
 '310 9022',
 '7668 27225',
 '576 2373',
 '66 3 21328',
 '9991 23754',
 '21592 11296',
 '141 142',
 '3747 1264',
 '309 871 10825',
 '4399 27600',
 '2061 28729',
 '17616 8252',
 '14742 3518',
 '3024 3025',
 '4235 17387',
 '4961 2338 4962',
 '310 6114',
 '18453 372',
 '67 871 20297',
 '14003 21158',
 '27057 97 24144',
 '1668 2338 25389',
 '3557 5405',
 '5668 5669 5670',
 '5714 18405',
 '22493 22494',
 '5595 26678',
 '24615 223 7924

In [29]:
authorList = []

def create_author_list(author_text):
    author_list = author_text.split(';')
    author_list_index = [all_authors_list.index(a) for a in author_list]
    
    authorList.append(author_list_index)

map(create_author_list, training.authors)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [30]:
authorList[:10]

[[0, 1],
 [2, 3],
 [4, 5, 6, 7, 8],
 [9, 10, 11],
 [12, 13, 14, 15],
 [16, 17],
 [18, 19, 20],
 [21, 22],
 [23, 24, 25, 26, 27],
 [28, 29, 30]]

In [91]:
cvect = CountVectorizer()
cvect.fit_transform(X_train.copy())

<6x6 sparse matrix of type '<type 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [93]:
cvect.vocabulary_

{u'authors': 0,
 u'publication_year': 1,
 u'record_id': 2,
 u'summary': 3,
 u'title': 4,
 u'topic_id': 5}

In [36]:
import ptm.at_model as atm

In [35]:
atm = atm()

NameError: name 'atm_model' is not defined

In [46]:
cvect.vocabulary_

{u'authors': 0,
 u'publication_year': 1,
 u'record_id': 2,
 u'summary': 3,
 u'title': 4,
 u'topic_id': 5}