In [1]:
import pandas as pd
import numpy as np
from plotly import tools, offline
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from collections import Counter
import re, unicodedata, zipfile, os, nltk
from nltk import LancasterStemmer, WordNetLemmatizer
from sklearn import model_selection
import plotly.plotly as py
import plotly.graph_objs as go
offline.init_notebook_mode(connected=True)

In [2]:
def read_files(path, folder):
    files = os.listdir(path+'/%s' % folder)    
    
    theme = []
    for i in range(len(files)):
        zfile = zipfile.ZipFile(path+'/{0}/{1}'.format(folder, files[i]))
        for finfo in zfile.infolist():
            ifile = zfile.open(finfo)
            text = ifile.read()
        theme.append(text)
    return theme

def generate_dataframe(path):
    data = pd.DataFrame()
    folder_files = os.listdir(path)
    for folder in folder_files:
        data = data.append(pd.DataFrame({'text': read_files(path, folder), 'target':folder}), ignore_index=True)
    return data

def del_word(df):
    df['text'] = df['text'].apply(lambda x: [word for word in x if len(word) > 3])
    df['text'] = df['text'].apply(lambda x: [word for word in x if not (word[0] and word[1] and word[2]) == word[0]])

def stem_words(df):
    stemmer = LancasterStemmer() 
    df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])

def tokenize(df):
    tokenizer = nltk.ToktokTokenizer()
    df['text'] = df['text'].apply(lambda x: tokenizer.tokenize(x))

def clean_text(text):
    new_text = re.sub(r"[^a-z ]", " ", text)
    return new_text

def delete_stop_words(df):
    stop = stopwords.words('english')
    df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop])
    
def preproc_text(df):
    df['text'] = df['text'].apply(lambda x: clean_text(str(x).lower()))
    tokenize(df)
    delete_stop_words(df)
    remove_non_ascii(df)
    stem_words(df)
    del_word(df)
    
def remove_non_ascii(df):
    df['text'] = df['text'].apply(lambda x: [unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore') for word in x])

In [20]:
data = generate_dataframe('data')
preproc_text(data)
data = data.sample(frac=1).reset_index(drop=True)

In [22]:
def to_dataframe_confussion_matrix(confussion_matrix, names):
    """Function to visualizate confussion_matrix"""
    
    actual = lambda: ['Actual ' + i for i in names]
    predicted = lambda: ['Predicted ' + i for i in names]
    
    df = pd.DataFrame(confussion_matrix, index=predicted(), columns=actual())
    
    return df

In [23]:
def PRA(precision, recall, f1, accuracity, names):
    """Function to visualizate metrics"""
    
    name_class = lambda: [i for i in names]
    
    df_1 = pd.DataFrame(precision, index=name_class(), columns=['Precision'])
    df_2 = pd.DataFrame(recall, index=name_class(), columns=['Recall'])
    df_3 = pd.DataFrame(f1, index=name_class(), columns=['F1'])
    df_4 = pd.DataFrame(accuracity, index=[names[0]], columns=['Accuracity'])
    df = pd.concat((df_1, df_2, df_3, df_4), axis=1, sort=False)
    
    df.fillna('', inplace=True)
    return df

In [24]:
class NaiveBayes():
    def __init__(self):
        pass
    
    def get_word_counts(self, X_slice):
            word_counts = {}
            count_of_words = 0

            for row in X_slice:
                for word in row:
                    word_counts[word] = word_counts.get(word, 0.0) + 1.0
                count_of_words += len(row)
                self.global_vocabluary.update(set(row))

            return word_counts, count_of_words

        
    def fit(self, X, y):
        unique_classes = np.unique(y)
        self.prob_of_each_class = {c: np.log(list(y).count(c)/y.shape[0]) for c in unique_classes}
        self.count_of_each_class = {c: list(y).count(c) for c in unique_classes}
        
        self.global_vocabluary = set()
        self.vocabluary = {class_: dict() for class_ in unique_classes}
        
        for c in unique_classes:
            X_slice = X[y == c]
            self.vocabluary[c]['voc_words'], self.vocabluary[c]['length'] = self.get_word_counts(X_slice)

        
    def predict(self, X):
        result = []
        for row in X:
            predicted = {}

            for class_ in self.count_of_each_class.keys():
                conjuction = []
                for word in row:
                    k = np.log(((self.vocabluary[class_]['voc_words'].get(word, 0) + 1)) / \
                                  (self.vocabluary[class_]['length'] + len(self.global_vocabluary)))
                    conjuction.append(k)
            
                predicted[class_] = sum(conjuction) + self.prob_of_each_class[class_]
            result.append(predicted)  
        return self.to_probability(result)
    
    
    def to_probability(self, result):
        """Transfrom from log to probability"""
        
        result_probability = []
        
        for row in result:
            temp = {}
            
            for class_ in row.keys():
                temp_y = row.copy()
                del temp_y[class_]
                denominator = [np.exp(row[i] - row[class_]) for i in temp_y]
                temp[class_] = 1 / (1 + sum(denominator))
            result_probability.append(temp)
            
        return result_probability

In [25]:
X = data['text']
y = data['target']

In [27]:
distribution = y.value_counts()
picture = [go.Bar(
            x=distribution.index,
            y=distribution
    )]
layout = go.Layout(
    title='Plot Title',
    yaxis=dict(
        title='Number of classes',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=picture, layout=layout)
offline.iplot(fig)

As we see my data is imbalanced. So i need to use Stratified cross_validation for correct metrics

In [73]:
hot_class = {class_:i for i, class_ in enumerate(np.unique(data['target'].values))}
yy = y.replace(hot_class)

In [74]:
def train_test_split(X, y, test_size, random_state=42, shuffle=False):
    test_indexes, train_indexes = [], []
    np.random.seed(random_state)
    for class_ in np.unique(y):    
        N = y[y == class_].value_counts().values[0]
        part = int(round(N * test_size))

        full_indexes = np.array(y[y == class_].index)
        low = full_indexes.min()
        high = full_indexes.max()
        
        temp = np.random.randint(low, high, part)
        test_indexes.extend(temp)

        temp = np.delete(full_indexes, test_indexes)
        train_indexes.extend(temp)
        
    if shuffle is True:
        np.random.shuffle(train_indexes)
        np.random.shuffle(test_indexes)
        
    X_train = X.iloc[train_indexes]
    y_train = y.iloc[train_indexes]
    X_test = X.iloc[test_indexes]
    y_test = y.iloc[test_indexes]
    return X_train, X_test, y_train, y_test

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, yy, test_size=0.3, random_state=15, shuffle=True)


in the future out of bounds indices will raise an error instead of being ignored by `numpy.delete`.



When we use simple train test split we can't we sure if our data in equal proportotion. Such problem can cause wrong value of our metrics.
I use StratifiedKFold because we need correct percentage of samples of each target class.

In [76]:
def get_report(y_true, y_predicted):
    """Calcucate metrics such as: Precision, recall, f1 and accuracy"""
    
    N = len(list(y_predicted[0].keys()))
    confusion_matrix = np.zeros((N, N), dtype=int)
    
    for index in range(y_true.shape[0]):
        confusion_matrix[np.argmax(list(y_predicted[index].values()))][y_true[index]] += 1
        
    recall = np.array([confusion_matrix[i,i]/sum(confusion_matrix[:,i]) for i in range(N)])
    precision = np.array([confusion_matrix[i,i]/sum(confusion_matrix[i, :]) for i in range(N)])
    f1 = 2 * precision * recall / (precision + recall)
    
    correct_answers = 0
    for i in range(len(y_predicted)):
        if list(y_predicted[i].keys())[np.argmax(np.array(list(y_predicted[i].values())))] == y_true[i]:
            correct_answers += 1
    
    return confusion_matrix, precision, recall, f1, correct_answers/y_true.shape[0]

In [77]:
def get_data_for_plot(metric, name, classes):
    data = [go.Bar(
        x= classes,
        y= metric[split],
        text= metric[split],
        textposition = 'auto',
        name = 'split {}'.format(1+split),
        marker=dict(
            line=dict(
                color='rgb(8,48,107)',
                width=1.5),
            ),
        opacity=0.6
    ) for split in range(len(metric))]

    std = np.std(np.array(metric).flatten())
    layout = go.Layout(
        title='{0}.Standard deviation = {1}'.format(name, std)
        )
    fig = go.Figure(data=data, layout=layout)
    
    return fig

In [78]:
cross_valid = model_selection.StratifiedKFold(n_splits=3, random_state=5)
result = []
for train_index, test_index in cross_valid.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = yy[train_index], yy[test_index]
    naive = NaiveBayes()
    naive.fit(X_train.values, y_train.values)
    predicted = naive.predict(X_test.values)
    result.append(get_report(y_test.values, predicted))


overflow encountered in exp



In [79]:
to_dataframe_confussion_matrix(result[0][0],np.unique(data['target'].values))

Unnamed: 0,Actual DRAMA,Actual EROTIC,Actual RELIGION,Actual SONGS,Actual TEACHER
Predicted DRAMA,23,2,1,1,0
Predicted EROTIC,0,10,0,0,0
Predicted RELIGION,0,0,1,0,0
Predicted SONGS,0,0,0,6,0
Predicted TEACHER,1,0,0,0,7


In [80]:
to_dataframe_confussion_matrix(result[1][0],np.unique(data['target'].values))

Unnamed: 0,Actual DRAMA,Actual EROTIC,Actual RELIGION,Actual SONGS,Actual TEACHER
Predicted DRAMA,23,0,0,1,2
Predicted EROTIC,0,11,0,0,0
Predicted RELIGION,0,0,1,0,0
Predicted SONGS,0,1,0,6,0
Predicted TEACHER,0,0,0,0,5


In [81]:
to_dataframe_confussion_matrix(result[2][0],np.unique(data['target'].values))

Unnamed: 0,Actual DRAMA,Actual EROTIC,Actual RELIGION,Actual SONGS,Actual TEACHER
Predicted DRAMA,22,1,0,0,1
Predicted EROTIC,0,10,0,0,0
Predicted RELIGION,0,0,1,0,0
Predicted SONGS,1,0,0,7,0
Predicted TEACHER,0,0,0,0,5


In [82]:
classes = np.unique(y.values)
f1, recall, precision = [], [], []
for report in result:
    precision.append(report[1])
    recall.append(report[2])
    f1.append(report[3])

In [83]:
offline.iplot(get_data_for_plot(precision, 'Precision', classes))

Low precision means the following:
This is the destiny of objects which really belong to classN relative to all objects which model predict as classN.
It happened because model has learning objects.(Look at picture 'Distribution of classes')

In [84]:
offline.iplot(get_data_for_plot(recall, 'Recall', classes))

Lets look at this picture. Low recall means that model can't find all objects which belong to one class.

In [85]:
offline.iplot(get_data_for_plot(f1, 'F1', classes))

In [86]:
naive = NaiveBayes()
naive.fit(X_train.values, y_train.values)
predicted = naive.predict(X_test.values)


overflow encountered in exp



In [87]:
confusion_matrix, precision, recall, f1, acc = get_report(y_test.values, predicted)

In [88]:
PRA(precision, recall, f1, acc, np.unique(data['target'].values))

Unnamed: 0,Precision,Recall,F1,Accuracity
DRAMA,0.916667,0.956522,0.93617,0.9375
EROTIC,1.0,0.909091,0.952381,
RELIGION,1.0,1.0,1.0,
SONGS,0.875,1.0,0.933333,
TEACHER,1.0,0.833333,0.909091,


In [89]:
to_dataframe_confussion_matrix(confusion_matrix,np.unique(data['target'].values))

Unnamed: 0,Actual DRAMA,Actual EROTIC,Actual RELIGION,Actual SONGS,Actual TEACHER
Predicted DRAMA,22,1,0,0,1
Predicted EROTIC,0,10,0,0,0
Predicted RELIGION,0,0,1,0,0
Predicted SONGS,1,0,0,7,0
Predicted TEACHER,0,0,0,0,5
