In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unicodedata
%matplotlib inline

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler
import string

In [2]:
%%file submissions/starting_kit/feature_extractor.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import pandas as pd
import numpy as np
import string
import unicodedata
import re
import scipy.sparse

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler

def clean_str(sentence, stem=False): # choose whether to stemmer
    english_stopwords = set(
        [stopword for stopword in stopwords.words('english')])
    punctuation = set(string.punctuation)
    punctuation.update(["``", "`", "...", "[", "]", "'", ",", "."])
    if stem:
        stemmer = SnowballStemmer('english')
        return list((filter(lambda x: x.lower() not in english_stopwords and
                            x.lower() not in punctuation,
                            [stemmer.stem(t.lower())
                             for t in word_tokenize(sentence)
                             if t.isalpha()])))

    return list((filter(lambda x: x.lower() not in english_stopwords and
                        x.lower() not in punctuation,
                        [t.lower() for t in word_tokenize(sentence)
                         if t.isalpha()])))


def strip_accents_unicode(s):
    if isinstance(s, float) or isinstance(s, int):
        s = str(s)
    try:
        s = unicode(s, 'utf-8')
    except NameError:  # unicode is a default on python 3
        pass 
    s = unicodedata.normalize('NFD', s) # NFC, or 'Normal Form Composed' returns composed characters, NFD, 'Normal Form Decomposed' gives you decomposed, combined characters.
    s = s.encode('ascii', 'ignore') # encodes a unicode string to ascii and ignores errors
    s = s.decode('utf-8', 'ignore')
    return str(s)

def feature_represent_fit(X_df):
    '''X_df: training dataframe
       Output: dictionary of useful categories with more than 10 word frequence/
               positions of dummy variables
    '''
    threshold = 10
    col_list = ['job', 'edited_by', 'researched_by', 'source', 
            'state', 'subjects']
    category_dict = {}
    position_dict = {}
    for col in col_list:
        i = 0
        category_dict[col] = []
        value_count = X_df[col].value_counts()
        
        for index, count in zip(value_count.index, value_count):
            if count > threshold:
                category_dict[col].append(index)
            else:
                X_df.loc[X_df[col]==index, col] = col + '_others'
                i += 1
        category_dict[col].append(col + '_others')
        #print('    column {}, unique items before group {}, after group {}'.format(
        #        col, len(value_count), len(value_count) - i + 1)) 
        # Get dummies of training data
        dummy_col = pd.get_dummies(X_df[col])
        # Position dictionary for each dummy variable
        position_dict[col] = dict(zip(dummy_col.columns, np.arange(len(dummy_col.columns))))
    return category_dict, position_dict

def feature_represent_transform(category_dict, position_dict, X_df):
        '''X_df: training or test dataframe'''
        col_list = ['job', 'edited_by', 'researched_by', 'source', 
                    'state', 'subjects']
        df_dummy = pd.DataFrame()
        for col in col_list:
            X_df.loc[~X_df[col].isin(category_dict[col]), col] = col + '_others'
            #print(len(set(X_df[col])), len(category_dict[col]))
            dummy_col = pd.get_dummies(X_df[col])
            # Elements in training set not in test set
            complete = list(set(category_dict[col]) - set(X_df[col]))
            if len(complete) != 0:
            # Complete lacked columns by position
                for w in complete:
                    if int(position_dict[col][w]) < len(dummy_col.columns):
                        dummy_col.insert(loc=int(position_dict[col][w]), column=w, value=0)
                    else:
                        dummy_col[w] = 0
            df_dummy = pd.concat([df_dummy, dummy_col], axis=1)
        return df_dummy




from sklearn.feature_extraction.text import TfidfVectorizer
class FeatureExtractor(TfidfVectorizer):
    """Convert a collection of raw documents to a matrix of TF-IDF features. """

    def __init__(self):
        super(FeatureExtractor, self).__init__(  # lets you avoid referring to the base class explicitly, which can be nice. But the main advantage comes with multiple inheritance
            input='content', encoding='utf-8',
            decode_error='ignore', strip_accents=None, lowercase=True,
            preprocessor=None, tokenizer=None, analyzer='word',
            stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
            ngram_range=(1, 3), max_df=1.0, min_df=1,
            max_features=None, vocabulary=None, binary=False,
            dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
            sublinear_tf=True)

    def fit(self, X_df, y=None):
        """Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.
        Returns
        -------
        self
        """
                        
        self._feat = np.array([' '.join(clean_str(strip_accents_unicode(dd)))
                                for dd in X_df.statement]) # statement
        super(FeatureExtractor, self).fit(self._feat)
        self.category_dict, self.position_dict = feature_represent_fit(X_df)
        return self
    
    
    def fit_transform(self, X_df, y=None):
        return self.fit(X_df).transform(X_df)

    def transform(self, X_df):       
        # Tf-idf for statement
        statement = np.array([' '.join(clean_str(strip_accents_unicode(dd)))
                              for dd in X_df.statement])
        check_is_fitted(self, '_feat', 'The tfidf vector is not fitted')
        tfidf = super(FeatureExtractor, self).transform(statement)
        # dummy coding
        X_dummy = feature_represent_transform(self.category_dict, self.position_dict, X_df)
        # Combine Tf-idf and dummy 
        array_dummy = np.array(X_dummy)
        matrix_all = scipy.sparse.hstack([tfidf, array_dummy]).tocsr()
        return matrix_all.todense()

Overwriting submissions/starting_kit/feature_extractor.py


In [7]:
%%file submissions/starting_kit/classifier.py
# -*- coding: utf-8 -*-
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
import numpy as np
 

class Classifier(BaseEstimator):
    def __init__(self):
        self.clf =MultinomialNB(fit_prior = False)
        
    def fit(self, X, y):
        self.clf.fit(X, (y >= 3) * 1)
 
    def predict(self, X):
        return self.clf.predict(X) * 3 + 1
    
    def predict_proba(self, X):
        pred = self.clf.predict(X)
        probas = []
        for i in range(pred.shape[0]):
            if pred[i] ==1:
                proba = [0, 0, 0, 0, 1, 0]
            else:
                proba = [0, 1, 0, 0, 0, 0]
            probas.append(proba)
        return np.array(probas)

Overwriting submissions/starting_kit/classifier.py


In [8]:
!ramp_test_submission 

[38;5;178m[1mTesting Fake news: classify statements of public figures[0m
[38;5;178m[1mReading train and test files from ./data ...[0m
[38;5;178m[1mReading cv ...[0m
[38;5;178m[1mTraining ./submissions/starting_kit ...[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X_df.loc[X_df[col]==index, col] = col + '_others'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X_df.loc[~X_df[col].isin(category_dict[col]), col] = col + '_others'
[38;5;178m[1mCV fold 0[0m
[3