### Imports

In [107]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from ficlearn.feature_extraction.text import BnsTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from itertools import product
from sklearn import svm
from sklearn import cross_validation
from sklearn import metrics
from ficlearn.metrics import crossValidationScores
import codecs as cs
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np
import nltk
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB

from ggplot import *

from stop_words import get_stop_words



# Functions

In [122]:
def sort_to_categories(df, lower=2500000, higher=4000000):
    """Label every row of ``df`` with a price category.

    The category is derived from the ``price`` column:

    * ``0`` -- price strictly below ``lower``
    * ``1`` -- everything else, i.e. ``lower <= price <= higher``; values that
      compare false against both bounds (such as NaN) also end up here
    * ``2`` -- price strictly above ``higher``

    Args:
        df: DataFrame with a ``price`` column. It is modified in place.
        lower: Exclusive upper bound of category 0.
        higher: Exclusive lower bound of category 2.

    Returns:
        The same ``df`` object, now carrying a ``categories`` column. The
        column is inserted first; if it already exists it is overwritten in
        place, so re-running the cell does not fail.
    """
    prices = np.asarray(df['price'])

    # np.select returns a plain array, so the labels are assigned by position.
    # Wrapping them in a new pd.Series would align on a 0..n-1 index and
    # silently give NaN labels to frames that use any other index.
    labels = np.select([prices < lower, prices > higher], [0, 2], default=1)

    if 'categories' in df.columns:
        df['categories'] = labels
    else:
        df.insert(loc=0, column='categories', value=labels)

    return df
    

In [5]:
def preprocessing(df):
    """Tokenize and filter the ``description`` text of every row.

    Each description is lower-cased and split with
    ``nltk.wordpunct_tokenize``. Tokens that are in the stop list or that
    parse as integers are dropped, and the survivors are joined with single
    spaces.

    The stop list is NLTK's Swedish stop words plus the ASCII punctuation
    characters and a handful of markup leftovers, with the characters
    U+00E5 / U+00E4 / U+00F6 rewritten as ``aa`` / ``ae`` / ``oe``.
    NOTE(review): the descriptions themselves are not transliterated here --
    presumably the input already uses aa/ae/oe; confirm against the data.

    Args:
        df: DataFrame with a ``description`` column. It is modified in place.

    Returns:
        None. The result is stored in a ``tokens`` column of ``df``.
    """
    markup_leftovers = ['gt', 'lt', 'amp', 'quot', 'align', '**', '***', '--', '//', '://', '),', ').']

    # Iterate the punctuation *string* to get one-character entries. Encoding
    # it to bytes first yields integers on Python 3 ('33', '34', ... after
    # str()), which left punctuation tokens unfiltered.
    raw_stop = stopwords.words('swedish') + list(string.punctuation) + markup_leftovers

    # A set keeps the per-token membership test constant-time.
    stop = {
        str(s).replace(u'\xe5', 'aa').replace(u'\xe4', 'ae').replace(u'\xf6', 'oe')
        for s in raw_stop
    }

    result = []
    for doc in df['description']:
        # Missing descriptions (None / NaN) carry no text to tokenize.
        text = doc.lower() if isinstance(doc, str) else ''
        kept = [word for word in nltk.wordpunct_tokenize(text)
                if word not in stop and not is_int(word)]
        result.append(' '.join(kept))
    df['tokens'] = result

In [6]:
def is_int(s):
    """Tell whether a token can be parsed as an integer.

    Args:
        s: The token to check.

    Returns:
        True when ``int(s)`` succeeds, False when it raises ``ValueError``.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True

In [114]:
def tfid_calc(vocab, train, test, n = 1, min_df = 10):
    """Turn the train and test texts into dense TF-IDF matrices.

    The vectorizer is fitted on ``vocab['tokens']`` and then applied to
    ``train`` and ``test``.

    Args:
        vocab: Object whose ``'tokens'`` entry holds the texts the vectorizer
            is fitted on.
        train: Texts of the training set.
        test: Texts of the test set.
        n: N-gram size; only n-grams of exactly this length are used.
        min_df: Passed to ``TfidfVectorizer(min_df=...)``. Defaults to the
            previously hard-coded value of 10.

    Returns:
        Tuple ``(train_mtx, test_mtx)`` of dense arrays produced by
        ``.toarray()``.
    """
    tf = TfidfVectorizer(ngram_range=(n, n), min_df=min_df)
    tf.fit(vocab['tokens'])

    train_mtx = tf.transform(train).toarray()
    test_mtx = tf.transform(test).toarray()

    return train_mtx, test_mtx


In [115]:
def read_data(vocab_path='output_new.json', listings_path='sthlm_format.json'):
    """Load both JSON data sets, preprocess them and build the labels.

    According to the original notes, the vocabulary file holds listings from
    all of Sweden (used only to get a larger vocabulary) and the listings file
    is limited to the Stockholm region.

    Args:
        vocab_path: JSON file read into the vocabulary frame.
        listings_path: JSON file read into the frame that is classified.

    Returns:
        Tuple ``(X, Y, vocab)``:

        * ``X`` -- the ``tokens`` column of the listings frame (preprocessed
          text, not yet vectorized)
        * ``Y`` -- the ``categories`` column produced by
          ``sort_to_categories`` with its default bounds
        * ``vocab`` -- the preprocessed vocabulary frame
    """
    vocab = pd.read_json(vocab_path)
    df = pd.read_json(listings_path)
    print('Processing data....')

    # Both calls add a 'tokens' column to the frame they are given.
    preprocessing(vocab)
    preprocessing(df)

    Y = sort_to_categories(df)['categories']
    X = df['tokens']

    return X, Y, vocab

In [116]:
def classify(X, Y, vocab, test_size=0.30, random_state=None):
    """Run one train/test split and classify the test set.

    The data is split at random, both parts are vectorized with ``tfid_calc``
    and a ``MultinomialNB`` model fitted on the training part predicts the
    test part.

    Args:
        X: Texts of all data points.
        Y: Labels of all data points.
        vocab: Vocabulary frame handed on to ``tfid_calc``.
        test_size: Share of the data held out for testing. Defaults to the
            previously hard-coded 0.30.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original behaviour of a different split on
            every call; pass an integer for a reproducible split.

    Returns:
        List ``[y_test, y_pred]`` with the true and the predicted labels of
        the test set.
    """
    print('Splitting into test and training sets')
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    print('Calculating tfidf....')
    X_train, X_test = tfid_calc(vocab, X_train, X_test)

    print("Classifying....")
    model = MultinomialNB()
    y_pred = model.fit(X_train, y_train).predict(X_test)

    return [y_test, y_pred]
    

In [119]:
def get_metrics(results):
    """Print metrics averaged over all classification iterations.

    For every iteration a confusion matrix and the per-class precision,
    recall and F1 score are computed. The element-wise mean and standard
    deviation across *all* iterations are then printed.

    Args:
        results: Sequence with one entry per iteration; entry ``[0]`` holds
            the true labels and entry ``[1]`` the predicted labels.

    Returns:
        None. Everything is written to standard output.

    Raises:
        ValueError: If ``results`` is empty.
    """
    if len(results) == 0:
        raise ValueError('results must contain at least one [y_true, y_pred] pair')

    # Use one label set for every iteration so that all matrices share a
    # shape, even when a random split happens to miss a class entirely.
    labels = np.unique(np.concatenate(
        [np.asarray(part).ravel() for each in results for part in (each[0], each[1])]
    ))

    # Shape: (iterations, classes, classes)
    confusions = np.array([
        metrics.confusion_matrix(each[0], each[1], labels=labels)
        for each in results
    ])

    # Shape: (iterations, 3, classes); axis 1 is precision / recall / F1.
    # The fourth value returned by sklearn (support) is not reported.
    reports = np.array([
        metrics.precision_recall_fscore_support(each[0], each[1], labels=labels)[:3]
        for each in results
    ])

    # Aggregate over the iteration axis, i.e. over every run.
    mean_conf = confusions.mean(axis=0)
    std_conf = confusions.std(axis=0)

    prec_mean, recall_mean, f1_mean = reports.mean(axis=0)
    prec_sd, recall_sd, f1_sd = reports.std(axis=0)

    print('Mean confusion Matrix')
    print(mean_conf)

    print('Confusion Matrix, standard deviation of each instance of the matrix')
    print(std_conf)

    # str() is required: concatenating a str with an ndarray raises TypeError.
    print('Mean precision for each class:' + str(prec_mean))
    print('Mean Recall for each class:' + str(recall_mean))
    print('Mean F1-score for each class:' + str(f1_mean))

    print('St dev of precision for each class:' + str(prec_sd))
    print('St dev of Recall for each class:' + str(recall_sd))
    print('St dev of F1-score for each class:' + str(f1_sd))
    

In [None]:
def main(nr_iter=100):
    """Load the data once and classify it ``nr_iter`` times.

    Args:
        nr_iter: Number of classification iterations.

    Returns:
        List with one ``classify`` result per iteration.
    """
    X, Y, vocab = read_data()
    return [classify(X, Y, vocab) for _ in range(nr_iter)]
    

# Program

Run the `main(nr_iter)` function with the desired number of classification iterations (default = 100). The result is a list holding, for each iteration, the true and the predicted labels of the test data set.

In [120]:
# Load the data once, then run the default 100 train/test iterations.
results = main()

[ 0.69911339  0.49007476  0.76606571]
[ 0.54438058  0.69540131  0.56954089]
[ 0.61114828  0.57440066  0.65257248]
[ 0.01611135  0.02222723  0.01987316]
[ 0.03246322  0.01536031  0.02494182]
[ 0.0158282   0.01229015  0.00874642]


Print average metrics of the classification

In [None]:
# Prints the confusion-matrix and precision/recall/F1 summaries; returns nothing.
get_metrics(results)