In [1]:
import scipy as sp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
%matplotlib inline

In [2]:
train_filename = 'data/train.csv'
data = pd.read_csv(train_filename, sep='\t')
data = data.fillna('')
data['date'] = pd.to_datetime(data['date'])

test_filename = 'data/test.csv'
data_test = pd.read_csv(test_filename, sep='\t')
data_test = data_test.fillna('')
data_test['date'] = pd.to_datetime(data_test['date'])

In [3]:
def f(df):
    df['Year'] = pd.DatetimeIndex(df['date']).year
    df['Month'] = pd.DatetimeIndex(df['date']).month
    df['Day'] = pd.DatetimeIndex(df['date']).day
    return df

In [4]:
#%%file submissions/starting_kit/feature_extractor.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import numpy as np
import string
import unicodedata

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler

def clean_str(sentence, stem=True):
    english_stopwords = set(
        [stopword for stopword in stopwords.words('english')])
    punctuation = set(string.punctuation)
    punctuation.update(["``", "`", "..."])
    if stem:
        stemmer = SnowballStemmer('english')
        return list((filter(lambda x: x.lower() not in english_stopwords and
                            x.lower() not in punctuation,
                            [stemmer.stem(t.lower())
                             for t in word_tokenize(sentence)
                             if t.isalpha()])))

    return list((filter(lambda x: x.lower() not in english_stopwords and
                        x.lower() not in punctuation,
                        [t.lower() for t in word_tokenize(sentence)
                         if t.isalpha()])))

def strip_accents_unicode(s):
    try:
        s = unicode(s, 'utf-8')
    except NameError:  # unicode is a default on python 3
        pass
    s = unicodedata.normalize('NFD', s)
    s = s.encode('ascii', 'ignore')
    s = s.decode("utf-8")
    return str(s)

from sklearn.feature_extraction.text import TfidfVectorizer
class FeatureExtractor(TfidfVectorizer):
    """Convert a collection of raw documents to a matrix of TF-IDF features. """

    def __init__(self):
        super(FeatureExtractor, self).__init__(
            input='content', encoding='utf-8',
            decode_error='strict', strip_accents=None, lowercase=True,
            preprocessor=None, tokenizer=None, analyzer='word',
            stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
            ngram_range=(1, 1), max_df=1.0, min_df=1,
            max_features=None, vocabulary=None, binary=False,
            dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
            sublinear_tf=False)
        
    def fit(self, X_df, y=None):
        """Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.
        Returns
        -------
        self
        """
        self._feat = np.array([' '.join(
            clean_str(strip_accents_unicode(dd)))
            for dd in X_df.statement])
        super(FeatureExtractor, self).fit(self._feat)
        return self

    def fit_transform(self, X_df, y=None):
        self.fit(X_df)
        return self.transform(self.X_df)

    def transform(self, X_df):
        X = np.array([' '.join(clean_str(strip_accents_unicode(dd)))
                      for dd in X_df.statement])
        check_is_fitted(self, '_feat', 'The tfidf vector is not fitted')
        X = super(FeatureExtractor, self).transform(X)
        return X

In [5]:
#%%file submissions/starting_kit/classifier.py
# -*- coding: utf-8 -*-
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

class Classifier(BaseEstimator):
    def __init__(self):
        #self.clf = RandomForestClassifier()
        self.clf = MLPClassifier(hidden_layer_sizes=(300, 300, 300))
        #self.clf = SVC()
        #self.clf = LogisticRegression(C=1e5, solver='sag')
        #self.clf = MultinomialNB()

    def fit(self, X, y):
        self.clf.fit(X.todense(), y)

    def predict(self, X):
        return self.clf.predict(X.todense())

    def predict_proba(self, X):
        return self.clf.predict_proba(X)

In [6]:
y_train = data.truth
X_tr = f(data.drop("truth", axis=1)).drop("date", axis=1)
texto_train = X_tr.iloc[:,:7] 
fecha_train = X_tr.iloc[:,7:] 

y_test = data_test.truth
X_te = f(data_test.drop("truth", axis=1)).drop("date", axis=1)
texto_test = X_te.iloc[:,:7] 
fecha_test = X_te.iloc[:,7:] 

df_texto = texto_train.append(texto_test, ignore_index=True)
df_fecha = fecha_train.append(fecha_test, ignore_index=True)
df_y = y_train.append(y_test, ignore_index=True)

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

job = pd.get_dummies(pd.DataFrame(df_texto['job']).replace(to_replace='', value='Desconocido'))
source = df_texto['source'].astype('category').cat.codes
"""state = pd.get_dummies(pd.DataFrame(df_texto['state']).replace({'':'Desconocido',
                                                                   'Washington, D.C.':'District of Columbia',
                                                                   'Washington state':'Washington',
                                                                   'Rhode island':'Rhode Island',
                                                                   'Tennessee':'Tennesse',
                                                                   'Virgina':'Virginia',
                                                                  'ohio':'Ohio'}))"""
state = df_texto['state'].replace({'':'Desconocido', 'Washington, D.C.':'District of Columbia',
                                       'Washington state':'Washington','Rhode island':'Rhode Island',
                                        'Tennessee':'Tennesse', 'Virgina':'Virginia',
                                       'ohio':'Ohio'}).astype('category').cat.codes
ptm = pd.DataFrame((df_texto['subjects'].str.strip('[]')).str.replace("'",""))
hdp = []
for i in range(len(ptm.values)):
    ptm_uni = [np.char.strip(x.split(',')) for x in ptm.values[i]]
    hdp.append(ptm_uni)
subjects = pd.Series((v[0] for v in hdp)).str.join(sep='*').str.get_dummies(sep='*')
df_texto = df_texto.drop(['subjects'],axis=1)
df_texto = pd.concat([df_texto, subjects],axis=1)
df = pd.DataFrame(df_texto[['edited_by', 'researched_by']], 
                  columns=['edited_by', 'researched_by'])
df["edited_by"] = df["edited_by"].str.split(", ")
df["researched_by"] = df["researched_by"].str.split(", ")
left = df.join(pd.DataFrame(mlb.fit_transform(df.pop('edited_by')),
                          columns=mlb.classes_,
                          index=df.index)).drop(['','researched_by'], axis=1)
right =  df.join(pd.DataFrame(mlb.fit_transform(df.pop('researched_by')),
                          columns=mlb.classes_,
                          index=df.index)).drop([''], axis=1)
e = set(left)
r = set(right)
df=right.add(2*left[list(e.intersection(r))]).fillna(right)
edited_researched = pd.concat([df, 2*left[list(e.difference(r))]], axis=1)
cl = FeatureExtractor()
cl.fit(pd.DataFrame(df_texto['statement']))
statement = cl.transform(pd.DataFrame(df_texto['statement']))

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

pca_statement = PCA(svd_solver='full')
pca_statement.fit(statement.toarray())

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [9]:
np.argmax(np.cumsum(pca_statement.explained_variance_ratio_) > 0.99)

4745

In [10]:
pca_statement.n_components = 4745
X_statement = pca_statement.fit_transform(statement.toarray())

In [11]:
y = df_y.values
#X = np.array(pd.concat([edited_researched,job,source,state,subjects,pd.DataFrame(statement.toarray())], axis=1))
#sparcido = sparse.csr_matrix(np.array(pd.concat([edited_researched, job, source, state, subjects], axis=1)))
#X = sp.sparse.hstack([sparcido, statement])
X = np.array(pd.concat([edited_researched,job,source,state,subjects,pd.DataFrame(X_statement)], axis=1))

In [12]:
X.shape

(10460, 5204)

In [13]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

pca = PCA()
pca.fit(normalize(X))
np.argmax(np.cumsum(pca.explained_variance_ratio_) > 0.9999)

1025

In [14]:
pca.n_components = 1025
X_pca = pca.fit_transform(normalize(X))

In [15]:
X_train, X_test = np.split(X_pca, [data.shape[0]])
y_train, y_test = np.split(y, [data.shape[0]])

In [20]:
statement.toarray()[statement.toarray() < 0.0]

array([], dtype=float64)

In [33]:
from sklearn.metrics import accuracy_score

clf = Classifier()
clf.fit(sparse.csr_matrix(X_pca), y)
pred = clf.predict(sparse.csr_matrix(X_test))

print(accuracy_score(y_test, pred))

0.417156693186


In [14]:
X_train.shape

(7569, 7753)

In [None]:
!ramp_test_submission --quick-test