# Estudos de Caso - Classificação Multi-classe


#### 1 - Importando a base de dados

In [1]:
%matplotlib inline
import pandas as pd

In [2]:
# Read the complaints CSV and keep a reproducible 5% random sample of its rows
# (random_state pins which rows are drawn).
df = (
    pd.read_csv('./Data/Consumer_Complaints.csv')
      .sample(frac=0.05, random_state=10)
)
# Preview the first rows of the sample.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer Complaint,Company Public Response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date Sent to Company,Company Response to Consumer,Timely response?,Consumer disputed?,Complaint ID,Unnamed: 18
149494,03/14/2016,Payday loan,,Can't contact lender,,I have been receiving harassing threatening ph...,Company believes complaint caused principally ...,"EZCORP, INC.",OH,453XX,,Consent provided,Web,03/14/2016,Closed with explanation,Yes,No,1831941,
467117,01/20/2016,Credit reporting,,Credit monitoring or identity protection,Problem with fraud alerts,,Company chooses not to provide a public response,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",IN,46341,"Older American, Servicemember",,Phone,01/20/2016,Closed with non-monetary relief,Yes,No,1750672,
743113,09/27/2013,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,,,Statebridge Company,GA,30052,,,Web,09/27/2013,Closed with explanation,Yes,No,545186,
545672,06/13/2014,Student loan,Non-federal student loan,Dealing with my lender or servicer,Need information about my balance/terms,,,AES/PHEAA,GA,30607,,,Web,06/13/2014,Closed with explanation,Yes,Yes,893762,
55857,07-06-2015,Credit reporting,,Incorrect information on credit report,Information is not mine,,,"EQUIFAX, INC.",NC,28334,,Consent not provided,Web,07-06-2015,Closed with explanation,Yes,No,1453534,


#### 2 - Limpando a base de dados

In [None]:
# The preview of this CSV lists the free-text column as 'Consumer Complaint',
# while this cell originally selected 'Consumer complaint narrative', which
# raises KeyError on that header. Accept whichever spelling is present.
narrative_candidates = ['Consumer complaint narrative', 'Consumer Complaint']
present = [name for name in narrative_candidates if name in df.columns]
if not present:
    raise KeyError('None of {} found in columns {}'.format(
        narrative_candidates, list(df.columns)))
col = ['Product', present[0]]

# Keep only the label and the text, and only rows that actually have text.
# .copy() detaches the result from the original frame so that adding
# 'category_id' below does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df[present[0]].notnull(), col].copy()

# Use an identifier-friendly name for the text column.
df.columns = ['Product', 'Consumer_complaint_narrative']

# Integer code per distinct Product, numbered in order of first appearance.
df['category_id'] = df['Product'].factorize()[0]

In [None]:
# One row per distinct (Product, category_id) pair, ordered by category_id.
category_id_df = (
    df[['Product', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

# Lookup tables in both directions: product name -> id and id -> product name.
category_to_id = dict(zip(category_id_df['Product'], category_id_df['category_id']))
id_to_category = dict(zip(category_id_df['category_id'], category_id_df['Product']))

#### 3 - Plotando as variáveis categóricas

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many narratives each Product has; y axis starts at zero.
fig = plt.figure(figsize=(8, 6))
narratives_per_product = df.groupby('Product')['Consumer_complaint_narrative'].count()
narratives_per_product.plot.bar(ylim=0)
plt.show()

#### 4 - Saco de palavras (bag of words) com TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with English stop words removed and terms
# that occur in fewer than 5 documents dropped.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Dense (n_documents, n_terms) matrix of TF-IDF weights.
complaint_texts = df['Consumer_complaint_narrative']
features = tfidf.fit_transform(complaint_texts).toarray()

# Integer class id for each document, aligned row-for-row with `features`.
labels = df['category_id']

features.shape

#### 5 - Chi2 para encontrar termos mais relacionados a cada categoria

In [None]:
from sklearn.feature_selection import chi2

import numpy as np
N = 2  # how many top unigrams / bigrams to print per category

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The vocabulary is the same for every category, so it is
# looked up once instead of on every loop iteration.
all_feature_names = np.asarray(tfidf.get_feature_names_out())

for Product, category_id in sorted(category_to_id.items()):
  # chi2 of every term against a binary "is this category" target.
  features_chi2 = chi2(features, labels == category_id)
  # Ascending sort of the chi2 statistic: the strongest terms end up last.
  indices = np.argsort(features_chi2[0])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(Product))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

#### 6 - Treinando o classificador Naive-Bayes Multinomial

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold out part of the narratives for testing; targets are the Product names.
X_train, X_test, y_train, y_test = train_test_split(
    df['Consumer_complaint_narrative'],
    df['Product'],
    random_state=0,
)

# Raw term counts followed by TF-IDF re-weighting, both fitted on the
# training split only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Multinomial Naive Bayes trained on the TF-IDF weighted features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

#### 7 - Fazendo previsões

In [None]:
# clf was fitted on TF-IDF weighted counts, so new text has to go through the
# same CountVectorizer -> TfidfTransformer chain before predict(); feeding it
# raw counts gives the model features on a different scale than it was trained on.
sample_counts = count_vect.transform(["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."])
print(clf.predict(tfidf_transformer.transform(sample_counts)))

In [None]:
# clf was fitted on TF-IDF weighted counts, so new text has to go through the
# same CountVectorizer -> TfidfTransformer chain before predict(); feeding it
# raw counts gives the model features on a different scale than it was trained on.
sample_counts = count_vect.transform(["I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"])
print(clf.predict(tfidf_transformer.transform(sample_counts)))

#### 8 - Selecionando o melhor classificador

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Candidate classifiers to compare on the same TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5  # number of cross-validation folds

# One (model_name, fold_idx, accuracy) record per model and fold. The records
# are collected in a list and turned into a frame once; the empty DataFrame
# that used to be pre-allocated here was dead code, overwritten below.
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  entries.extend(
      (model_name, fold_idx, accuracy)
      for fold_idx, accuracy in enumerate(accuracies)
  )

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

cv_df.head(100)


#### 9 - Plotando os resultados do processo de seleção

In [None]:
import seaborn as sns

# Per-model accuracy distribution across folds: a box plot with the individual
# fold scores overlaid as jittered points.
accuracy_by_model = dict(x='model_name', y='accuracy', data=cv_df)
sns.boxplot(**accuracy_by_model)
sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2,
              **accuracy_by_model)
plt.show()

#### 10 - Matriz de confusão SVC linear

In [None]:
model = LinearSVC()

# Split features, labels and the original row index together so that test
# predictions can be traced back to rows of `df` afterwards.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df.index,
    test_size=0.33, random_state=0,
)

# Fit on the training part and predict the held-out part.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix

# Pin the row/column order to the category ids in category_id_df. Without
# `labels`, confusion_matrix only covers classes that occur in y_test/y_pred,
# so a class missing from the test split would shrink the matrix and shift it
# out of step with the tick labels below and with indexing by category id.
conf_mat = confusion_matrix(y_test, y_pred,
                            labels=category_id_df.category_id.values)
fig, ax = plt.subplots(figsize=(10,10))

# Rows are the actual class, columns the predicted class.
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax,
            xticklabels=category_id_df.Product.values, yticklabels=category_id_df.Product.values)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

#### 11 - Verificação de causa de classificações incorretas

In [None]:
from IPython.display import display

# For every (actual, predicted) pair of different classes that was confused at
# least 10 times, print the count and show the affected test rows.
for predicted in category_id_df.category_id:
  for actual in category_id_df.category_id:
    n_confused = conf_mat[actual, predicted]
    if predicted == actual or n_confused < 10:
      continue
    print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], n_confused))
    # Boolean mask over the test split, mapped back to df rows via indices_test.
    confused_mask = (y_test == actual) & (y_pred == predicted)
    display(df.loc[indices_test[confused_mask], ['Product', 'Consumer_complaint_narrative']])
    print('')

#### 12 - Coeficientes do modelo para encontrar os termos mais correlacionados com cada categoria

In [None]:
# Refit the model on all rows before reading its per-class coefficients.
model.fit(features, labels)
N = 2  # how many top unigrams / bigrams to print per category

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The vocabulary is the same for every category, so it is
# looked up once instead of on every loop iteration.
all_feature_names = np.asarray(tfidf.get_feature_names_out())

for Product, category_id in sorted(category_to_id.items()):
  # Ascending sort of this class's coefficients; reversed below so that the
  # largest weights come first.
  indices = np.argsort(model.coef_[category_id])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
  bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
  print("# '{}':".format(Product))
  print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
  print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

#### 13 - Relatório de classificação

In [None]:
from sklearn import metrics
# Pass labels and target_names from the same table so each name is tied to its
# id. With target_names alone, classification_report raises ValueError when a
# class is absent from y_test/y_pred, because the number of names no longer
# matches the number of classes it finds.
print(metrics.classification_report(
    y_test, y_pred,
    labels=category_id_df['category_id'].values,
    target_names=category_id_df['Product'].values,
))

# Estudos de Caso - Classificação Multi-label

#### 14 - Importando bibliotecas necessárias

In [None]:
import re
import matplotlib
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
# English stop words from NLTK's stopwords corpus, held as a set.
stop_words = set(stopwords.words('english'))
from sklearn.pipeline import Pipeline

# Load the toxic-comment training set. Note that this rebinds `df`, which up
# to here held the consumer-complaints data.
toxic_csv_path = "./CSV/trainToxic.csv"
df = pd.read_csv(toxic_csv_path, encoding="ISO-8859-1")
df.head(n=10)

#### 15 - Número de comentários em cada categoria

In [None]:
# Everything except the id and the text is treated as a label column.
df_toxic = df.drop(columns=['id', 'comment_text'])
categories = df_toxic.columns.tolist()

# Sum of each label column, i.e. how many comments carry that label.
counts = [(category, df_toxic[category].sum()) for category in categories]

df_stats = pd.DataFrame(counts, columns=['category', 'number_of_comments'])
df_stats

#### 16 - Plotando números de comentários por categoria

In [None]:
# Bar chart of the number of comments per label; labels are set on the axes
# object returned by the plot call.
category_ax = df_stats.plot(
    x='category', y='number_of_comments',
    kind='bar', legend=False, grid=True, figsize=(8, 5),
)
category_ax.set_ylabel('# of Occurrences', fontsize=12)
category_ax.set_xlabel('category', fontsize=12)

#### 16 - Quantidade de comentários com rótulos múltiplos

In [None]:
# Number of labels set on each comment: row-wise sum over every column from
# the third one onwards.
rowsums = df.iloc[:, 2:].sum(axis=1)
# How many comments have 0, 1, 2, ... labels.
x = rowsums.value_counts()

plt.figure(figsize=(8, 5))
# seaborn >= 0.12 no longer accepts x/y positionally; pass them as keywords.
ax = sns.barplot(x=x.index, y=x.values)
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('# of categories', fontsize=12)

#### 17 - Proporção (fração entre 0 e 1) de comentários não rotulados

In [None]:
# Fraction of comments whose six label columns are all zero.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
is_unlabelled = (df[label_columns] == 0).all(axis=1)
print(int(is_unlabelled.sum()) / len(df))

#### 18 - Distribuição do número de caracteres nos comentários

In [None]:
# Character length of every comment labelled as insult, shown as a histogram
# with 10-character bins from 0 up to 190.
insult_comments = df.loc[df['insult'] == 1, 'comment_text']
lens = insult_comments.str.len()
lens.hist(bins=np.arange(0, 200, 10))

#### 18 - Há comentários faltando?

In [None]:
# Count of rows whose comment text is missing.
df['comment_text'].isna().sum()

#### 19 - Exemplo do primeiro comentário

In [None]:
# Text of the comment stored under index label 0.
df.loc[0, 'comment_text']

#### 20 - Limpeza de texto

In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase, space-separated word tokens.

    Lowercases the text, expands a handful of English contractions, replaces
    every non-word character with a space, collapses whitespace runs into a
    single space and trims the ends.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    # "'scuse" has to be handled before the generic "'s" rule: otherwise "'s"
    # eats its first two characters and this substitution can never match.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" goes before the generic "n't" rule so it becomes "can not"
    # rather than "ca not".
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings: '\W' and '\s' in a plain literal are invalid escape
    # sequences and emit a warning on recent Python versions.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip(' ')
    return text

# Clean every comment (this overwrites the raw text column).
df['comment_text'] = df['comment_text'].map(clean_text)

# Same comment as shown before cleaning, for comparison.
df.loc[0, 'comment_text']

#### 21 - Separando dados de treinamento e testes

In [None]:
# Label columns to model, one binary classifier each.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Shuffled 67/33 split of whole rows, so the label columns stay attached to
# the text in both parts.
train, test = train_test_split(df, test_size=0.33, random_state=42, shuffle=True)
X_train, X_test = train['comment_text'], test['comment_text']
print(X_train.shape, X_test.shape, sep='\n')

#### 22 - Pipeline UmContraTodos + Naive-Bayes Multinomial

In [None]:
# TF-IDF features followed by a one-vs-rest Multinomial Naive Bayes.
# stop_words is a set, but scikit-learn >= 1.2 validates this parameter and
# only accepts 'english', a list or None, so it is passed as a (sorted) list.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

# The same pipeline is refitted from scratch for each label column.
for category in categories:
    print('... Processing {}'.format(category))
    # train
    NB_pipeline.fit(X_train, train[category])
    # predict
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

#### 23 - Pipeline UmContraTodos + LinearSVC

In [None]:
# TF-IDF features followed by a one-vs-rest linear SVM.
# stop_words is a set, but scikit-learn >= 1.2 validates this parameter and
# only accepts 'english', a list or None, so it is passed as a (sorted) list.
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=5)),
            ])

# The same pipeline is refitted from scratch for each label column.
for category in categories:
    print('... Processing {}'.format(category))
    # train
    SVC_pipeline.fit(X_train, train[category])
    # predict
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))