In [11]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

from time import time

# Load the complaints sample from a CSV file located in the working directory,
# then preview its first five rows.
df = pd.read_csv('complaints_demo.csv', sep=',')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Product,Consumer_complaint_narrative,category_id
0,1,Vehicle loan or lease,I contacted Ally on Friday XX/XX/XXXX after fa...,0
1,12,"Credit reporting, credit repair services, or o...",Hello This complaint is against the three cred...,1
2,13,"Credit reporting, credit repair services, or o...",I am a victim of Identity Theft & currently ha...,1
3,19,"Credit reporting, credit repair services, or o...",Two accounts are still on my credit history af...,1
4,22,"Credit reporting, credit repair services, or o...",Receiving daily telephone call ( s ) from XXXX...,1


In [12]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: `df` is reassigned in place, so a
# second run of the cell would otherwise raise KeyError because the column is
# already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,Product,Consumer_complaint_narrative,category_id
0,Vehicle loan or lease,I contacted Ally on Friday XX/XX/XXXX after fa...,0
1,"Credit reporting, credit repair services, or o...",Hello This complaint is against the three cred...,1
2,"Credit reporting, credit repair services, or o...",I am a victim of Identity Theft & currently ha...,1
3,"Credit reporting, credit repair services, or o...",Two accounts are still on my credit history af...,1
4,"Credit reporting, credit repair services, or o...",Receiving daily telephone call ( s ) from XXXX...,1


In [13]:
# TF-IDF representation of the complaint narratives: unigrams and bigrams,
# English stop words removed, terms with a document frequency below 5 ignored.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
)

# Fit the vocabulary on every narrative and convert the result to a dense array.
# NOTE(review): the vectorizer is fitted on the whole dataset before any
# train/test split or cross-validation — confirm this is acceptable.
tfidf_matrix = tfidf.fit_transform(df['Consumer_complaint_narrative'])
features = tfidf_matrix.toarray()

# Target: the integer category id stored next to each narrative.
labels = df['category_id']

# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
# The DataFrame index is split alongside features and labels so that each
# train/test row can be traced back to its original record.
(
    X_train, X_test,
    y_train, y_test,
    indices_train, indices_test,
) = train_test_split(
    features,
    labels,
    df.index,
    test_size=0.3,
    random_state=1,
)

# Candidate classifiers to compare.
# Every estimator that accepts a seed is given random_state=0 so that repeated
# runs of the notebook produce the same scores; LinearSVC previously had no
# seed, unlike the other seedable models in this list.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    GaussianNB(),
    LogisticRegression(random_state=0),
]

# k-fold cross-validation with k = 5.
k = 5

# Collect one (model_name, fold_idx, accuracy) record per model and fold.
# NOTE(review): scoring uses the full `features`/`labels`, not X_train/y_train,
# so the held-out rows take part in model selection — confirm this is intended.
entries = []
for model in models:
    model_name = type(model).__name__

    start_time = time()
    accuracies = cross_val_score(
        estimator=model,
        X=features,
        y=labels,
        scoring='accuracy',
        cv=k,
    )
    # Wall-clock time taken by the k-fold cross-validation of this model.
    elapsed_time = time() - start_time
    print('Temps de execució del {} = {}'.format(model, elapsed_time))

    for fold_idx, accuracy in enumerate(accuracies):
        record = (model_name, fold_idx, accuracy)
        entries.append(record)

# Long-format table of per-fold accuracies, displayed as the cell output.
cv_df = pd.DataFrame(data=entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df



Temps de execució del RandomForestClassifier(max_depth=5, random_state=0) = 0.7353634834289551
Temps de execució del LinearSVC() = 0.046816110610961914
Temps de execució del MultinomialNB() = 0.015622615814208984
Temps de execució del GaussianNB() = 0.031248092651367188




Temps de execució del LogisticRegression(random_state=0) = 0.34230685234069824


Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.470588
1,RandomForestClassifier,1,0.470588
2,RandomForestClassifier,2,0.470588
3,RandomForestClassifier,3,0.470588
4,RandomForestClassifier,4,0.56
5,LinearSVC,0,0.72549
6,LinearSVC,1,0.647059
7,LinearSVC,2,0.666667
8,LinearSVC,3,0.705882
9,LinearSVC,4,0.8


In [14]:
# Mean accuracy of each model across the cross-validation folds, used to pick
# the best-performing model.
# The groupby/mean result is a Series, which has no `columns`; assigning to
# `mac.columns` only set a stray attribute and left the label as 'accuracy'.
# A Series is labelled through its name, so rename it instead.
mac = cv_df.groupby('model_name').accuracy.mean().rename('Mean Accuracy')
mac

model_name
GaussianNB                0.507765
LinearSVC                 0.709020
LogisticRegression        0.523922
MultinomialNB             0.468549
RandomForestClassifier    0.488471
Name: accuracy, dtype: float64