In [None]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import unidecode

In [None]:
# Stop-word list shared by the manual token filter and the TF-IDF vectorizers:
# NLTK's Portuguese list plus project-specific words kept in an Excel sheet.
stopwords_cust = stopwords.words('portuguese')


def _ascii_fold(words):
    """Return the string Series `words` lower-cased with accents removed (NFKD -> ASCII)."""
    return (words.str.normalize('NFKD')
                 .str.encode('ascii', errors='ignore')
                 .str.decode('utf-8')
                 .str.lower())


# The claim text is accent-stripped before stop words are matched (unidecode in the
# cleaning loop, strip_accents='unicode' in the vectorizers), so accented NLTK entries
# would never match there. Keep the original entries and add their accent-free variants.
nltk_folded = _ascii_fold(pd.Series(stopwords_cust))
stopwords_cust.extend(sorted(set(nltk_folded) - set(stopwords_cust)))

# Raw string: in a normal literal the "\U" of "C:\Users" starts a \UXXXXXXXX escape,
# which is a SyntaxError on Python 3.
df_swc = pd.read_excel(r'C:\Users\marcel.massa.LGE\ML\stop_words_customized.xlsx')

# STOP_WORDS_CUSTOMIZED is the name of the column in the Excel file
df_swc.STOP_WORDS_CUSTOMIZED = _ascii_fold(df_swc.STOP_WORDS_CUSTOMIZED)

# dropna(): blank spreadsheet cells would otherwise be appended as NaN "stop words".
stopwords_cust.extend(df_swc.STOP_WORDS_CUSTOMIZED.dropna())


In [None]:
# Read the labelled training claims from the Excel workbook into `df`.

train_file = 'C:\\Users\\marcel.massa.LGE\\ML\\train_multilabel_NEW.xlsx'
df = pd.read_excel(train_file)

# TRAINING DATA ANALYSIS SECTION

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
df.info()

In [None]:
# Count the claims per category: drop the 'Claim' text column and sum each
# remaining column.

df_symptoms = df.drop(['Claim'], axis=1)
categories = list(df_symptoms.columns.values)
counts = [(label, df_symptoms[label].sum()) for label in categories]
df_stats = pd.DataFrame(counts, columns=['category', 'number_of_claims'])
df_stats

In [None]:
# Bar chart of the per-category claim counts computed in df_stats.

stats_ax = df_stats.plot(x='category', y='number_of_claims', kind='bar', legend=False, grid=True, figsize=(15, 5))
stats_ax.set_title("Number of claims per category")
stats_ax.set_ylabel('# of Occurrences', fontsize=12)
stats_ax.set_xlabel('category', fontsize=12)

In [None]:
# Histogram of claim lengths in characters, using 50-character bins from 0 up to 5000.

bin_edges = np.arange(0, 5000, 50)
lens = df['Claim'].str.len()
lens.hist(bins=bin_edges)

In [None]:
# How many claims carry multiple labels?

# Sum across every label column. The rest of the notebook treats every column except
# 'Claim' as a label, so the previous positional slice df.iloc[:, 2:] left one label
# column out of the count.
rowsums = df.drop(['Claim'], axis=1).sum(axis=1)
x = rowsums.value_counts()

# plot
plt.figure(figsize=(8,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=x.index, y=x.values)
plt.title("Multiple categories per claim")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('# of categories', fontsize=12)

In [None]:
# Sanity check: show the text of the claim stored under index label 0.

df.loc[0, 'Claim']

In [None]:
# Sanity check: show the label names derived earlier from the training columns
# (every column except 'Claim').
categories

In [None]:
# Hold out 33% of the labelled claims for evaluation (shuffled, fixed seed)
# and keep the raw claim text of each part for vectorization.

train, test = train_test_split(df, test_size=0.33, shuffle=True, random_state=42)

X_train, X_test = train['Claim'], test['Claim']
print(X_train.shape)
print(X_test.shape)

In [None]:
# Set up the TF-IDF vectorizer and apply it to the train/test split.
# The vocabulary and IDF weights are learned from the training claims only.

vectorizer = TfidfVectorizer(
    encoding='utf-8',
    lowercase=True,
    strip_accents='unicode',
    stop_words=stopwords_cust,
    ngram_range=(1,2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)

x_train = vectorizer.fit_transform(train.Claim)
y_train = train.drop(labels = ['Claim'], axis=1)

# The test claims are only transformed with the already-fitted vectorizer;
# it is not refitted on them.
x_test = vectorizer.transform(test.Claim)
y_test = test.drop(labels = ['Claim'], axis=1)

In [None]:
# Multi-label classification with a classifier chain of LinearSVC models,
# fitted on the training split and scored on the held-out split.
from skmultilearn.problem_transform import ClassifierChain

classifier = ClassifierChain(LinearSVC())
classifier.fit(x_train, y_train)

predictions = classifier.predict(x_test)

# accuracy_score is computed on the full label matrices of the test split
chain_accuracy = accuracy_score(y_test,predictions)
print("Accuracy = ",chain_accuracy)
print("\n")

In [None]:
# Dimensions of the prediction matrix returned for the test split.
predictions.shape

In [None]:
# Load the new, unlabelled data; header=1 takes the column names from the
# second row of the sheet.
# Raw string: in a normal literal the "\U" of "C:\Users" starts a \UXXXXXXXX escape,
# which is a SyntaxError on Python 3.
df_new = pd.read_excel(r'C:\Users\marcel.massa.LGE\ML\CIC Call Receiving Rate_120515012.xls', header=1)
df_new.head()

In [None]:
from nltk.tokenize import word_tokenize

# Try the tokenizer and the stop-word filter on the first row's text.
first_text = df_new.loc[0 ,"Consultation Content"]
example = word_tokenize(first_text.lower())
example_clean = [token for token in example if token not in stopwords_cust]

print (example)
print (example_clean)

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Join the filtered example tokens back into a single string.
example_detokenizer = TreebankWordDetokenizer()
example_detokenizer.detokenize(example_clean)

In [None]:
# Start the 'Texto_Limpo' column as a copy of the raw text column.
raw_content = df_new["Consultation Content"]
df_new['Texto_Limpo'] = raw_content
df_new.head()

In [None]:
# Number of rows in the new data.
len(df_new)

In [None]:
# Fill 'Texto_Limpo' with a cleaned copy of every row's text: lower-cased,
# transliterated to ASCII, tokenized, stop words removed and re-joined.
stopword_set = set(stopwords_cust)        # set membership instead of a list scan per token
detokenizer = TreebankWordDetokenizer()   # one instance reused for all rows
n_rows = len(df_new)

cleaned_texts = []
for j, text in enumerate(df_new["Consultation Content"]):
    if pd.isnull(text):
        # Missing text has no .lower(); keep the value as it is instead of crashing.
        cleaned_texts.append(text)
    else:
        tokens = word_tokenize(unidecode.unidecode(text.lower()))
        clean_tokens = [w for w in tokens if w not in stopword_set]
        cleaned_texts.append(detokenizer.detokenize(clean_tokens))
    # Floor division: with "/" the midpoint is a float on Python 3 and is never
    # equal to j when the row count is odd.
    if j == n_rows // 2:
        print('... 50%')

# A single column assignment replaces one .loc write per row.
df_new['Texto_Limpo'] = cleaned_texts
print('... [DONE]')

In [None]:
# Preview the new data with the cleaned 'Texto_Limpo' column.
df_new.head()

In [None]:
# Column names, dtypes and non-null counts of the new data.
df_new.info()

In [None]:
# Refit the TF-IDF vectorizer on all labelled claims, then project the new
# texts onto that vocabulary.
tfidf_options = dict(
    encoding='utf-8',
    lowercase=True,
    strip_accents='unicode',
    stop_words=stopwords_cust,
    ngram_range=(1,2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)
vectorizer = TfidfVectorizer(**tfidf_options)

features = vectorizer.fit_transform(df['Claim'])
print(features.shape)

features_new = vectorizer.transform(df_new["Consultation Content"])
print(features_new.shape)

In [None]:
from sklearn.feature_selection import chi2

# For each category, list the N unigrams and N bigrams with the highest chi-squared
# score against that category's label column.
N = 5

# The vocabulary is the same for every category, so it is fetched once.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    all_feature_names = np.array(vectorizer.get_feature_names_out())
else:
    all_feature_names = np.array(vectorizer.get_feature_names())

for i in categories:
    features_chi2 = chi2(features, df[i])
    indices = np.argsort(features_chi2[0])      # ascending: the best terms come last
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(i))
    # The joined text is formatted directly: encoding it to bytes first makes
    # Python 3 print a b'...' repr with literal "\n" instead of a readable list.
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

In [None]:
from skmultilearn.problem_transform import ClassifierChain

# Train the LinearSVC classifier chain on all labelled claims
# (every column except 'Claim' is used as a target).
all_labels = df.drop(labels = ['Claim'], axis=1)
classifier = ClassifierChain(LinearSVC())
classifier.fit(features, all_labels)

# Predict the labels of the new texts and convert the result to a dense array.
predictions_new = classifier.predict(features_new).toarray()
predictions_new.shape

In [None]:
# Wrap the predicted label array in a DataFrame with one column per category.
test_DF = pd.DataFrame(data=predictions_new, columns=categories)
test_DF.head()

In [None]:
# Put the predicted label columns next to the new data; with axis=1 pandas
# aligns the rows of the two frames on their index.
final_DF = pd.concat([df_new, test_DF], axis=1)

In [None]:
# Preview the combined frame (new data plus predicted labels).
final_DF.head()

In [None]:
# Write the new data together with its predicted labels to an Excel workbook.
# Raw string: in a normal literal the "\U" of "C:\Users" starts a \UXXXXXXXX escape,
# which is a SyntaxError on Python 3.
final_DF.to_excel(r'C:\Users\marcel.massa.LGE\ML\CIC Call Receiving Rate_MULTI_LABEL_2_PREDICTED.xlsx')
print('\n\n*** FINISHED ***')

In [None]:
# Extend df_new with one (initially empty) column per category.
expanded_columns = df_new.columns.tolist() + categories
df_new = df_new.reindex(columns=expanded_columns)
df_new.head()

In [None]:
# Copy the predictions into df_new, one category column at a time.
# Column i of predictions_new holds the predictions for categories[i]. Assigning the
# whole column at once replaces one .loc write per cell (rows x categories writes),
# and pandas raises if the number of predictions does not match the number of rows.
for i, category in enumerate(categories):
    print('... Processing {}'.format(category))
    df_new[category] = predictions_new[:, i]
    print('... [DONE]')

print('\n\n*** FINISHED ***')

df_new.head()

In [None]:
# Write df_new, including the predicted category columns, to an Excel workbook.
# Raw string: in a normal literal the "\U" of "C:\Users" starts a \UXXXXXXXX escape,
# which is a SyntaxError on Python 3.
df_new.to_excel(r'C:\Users\marcel.massa.LGE\ML\CIC Call Receiving Rate_MULTI_LABEL_2_PREDICTED_BETA.xlsx')
print('\n\n*** FINISHED ***')

-
-
-
-
-
Example for LabelPowerset
-
-
-
-
-

In [None]:
# Multi-label classification with the Label Powerset transformation over LinearSVC,
# fitted on the training split and scored on the held-out split.
from skmultilearn.problem_transform import LabelPowerset

classifier = LabelPowerset(LinearSVC())
classifier.fit(x_train, y_train)

predictions = classifier.predict(x_test)

# accuracy_score is computed on the full label matrices of the test split
powerset_accuracy = accuracy_score(y_test,predictions)
print("Accuracy = ",powerset_accuracy)
print("\n")

In [None]:
from sklearn.metrics import classification_report

# Text report of the latest predictions on the test split, one row per category.
report_text = classification_report(y_test, predictions, target_names=categories)
print(report_text)