In [39]:
from __future__ import unicode_literals
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import nltk
import scipy
import urllib
from pathlib import Path 
%matplotlib inline

In [40]:
nltk.download("popular", quiet=True)

True

## Exploratory data analysis

###Loading the data

In [141]:
train_filename = Path('data/train.csv')
test_filename = Path('data/test.csv')
if train_filename.is_file() and test_filename.is_file():
    train = pd.read_csv(train_filename, sep=';', encoding='utf-8')
    test = pd.read_csv(test_filename, sep=';', encoding='utf-8')
else:
    %run train_test_generator.py    
frames = [train, test]
data = pd.concat(frames, axis=0)
fr_stopwords_url = "https://raw.githubusercontent.com/mkobbi/subvention-status-datacamp/master/data/stopwords-filter-fr.txt"
data = data.rename(columns=lambda x: x.decode('utf-8').encode('ascii', errors ='ignore'))

In [142]:
string_columns = ["Nom du partenaire", 'Intitul de la demande']
to_drop_columns = ["Anne", "Siret", "N SIMPA", 'CP-Adresse-Libell voie', "CP-Adresse-Ville", "y"] 
str_categorical_columns = ["Nom du partenaire", "Appel  projets","Appel  projets PolVille"]
num_categorical_columns = ["Anne", "CP-Adresse-Code postal"]
num_categorical_columns = ["CP-Adresse-Code postal"]
data = data.fillna(value=0, axis='columns')
data[string_columns] = data[string_columns].apply(lambda x: x.str.upper().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'))
data[str_categorical_columns] = data[str_categorical_columns].apply(lambda x: x.astype('category').cat.codes)
data[num_categorical_columns] = data[num_categorical_columns].apply(lambda x: x.astype('int'))#.astype('category'))
fr_stopwords = urllib.urlopen(fr_stopwords_url).read().decode("utf-8").upper()
fr_stopwords = fr_stopwords.split('\n')
y = np.ravel(data['y'], axis='columns')
data = data.drop(['y'], axis='columns')

In [143]:
print(np.unique(data['Nom du partenaire']))
print(len(np.unique(data['Nom du partenaire'])))

[   0    1    2 ... 6066 6067 6068]
6069


In [144]:
data.dtypes

Unnamed: 0                           int64
Nom du partenaire                    int16
Intitul de la demande               object
Appel  projets                        int8
Montant vot par demande            float64
Fonctionnement                     float64
Subventions sur projet             float64
Subventions d'quipement            float64
Subventions sans nature spcifie    float64
CP-Adresse-Code postal               int32
Appel  projets PolVille               int8
S-DE-Montant demand                float64
dtype: object

In [145]:
stemmer = nltk.stem.snowball.FrenchStemmer()
data['stemmed'] = data['Intitul de la demande'].map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))

In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(norm='l2', min_df=0, max_df=1, use_idf=True, smooth_idf=True,
                       sublinear_tf=True, stop_words=fr_stopwords, analyzer='word')
words = vect.fit_transform(data.stemmed)
#words = pd.DataFrame(words.todense())
#df = df.loc[~df.index.duplicated(keep='first')]
#print(words.shape, list(data))
#data = pd.concat([data, words], axis='columns')
data = data.drop(['Intitul de la demande', 'stemmed'], axis='columns')

In [147]:
data_sparse = scipy.sparse.csr_matrix(data.values[:,1:])
X = normalize(scipy.sparse.hstack((data_sparse, words)))

In [148]:
print(X.shape, y.shape)

((26273, 5746), (446641L,))


In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [26273, 446641]

## Classifying

In [135]:
clf = SVC(kernel='linear', class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [136]:
accuracy_score(y_test, y_pred)

0.6872146118721462

In [None]:
data.describe()

In [129]:
data.count()

Unnamed: 0                         26273
Nom du partenaire                  26273
Appel  projets                     26273
Montant vot par demande            26273
Fonctionnement                     26273
Subventions sur projet             26273
Subventions d'quipement            26273
Subventions sans nature spcifie    26273
CP-Adresse-Code postal             26273
Appel  projets PolVille            26273
S-DE-Montant demand                26273
y                                  26273
dtype: int64

The dataset contains 26000+ instances mostly numericals and one categorical feature.