In [1]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from ficlearn.feature_extraction.text import BnsTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from itertools import product
from sklearn import svm
from sklearn import cross_validation
from sklearn import metrics
from ficlearn.metrics import crossValidationScores
import codecs as cs
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np
import nltk
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB

from ggplot import *


You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
def sort_to_categories(df):
    
    price_list = df['price'].tolist()
    
    max_price = df['price'].max()
    min_price = df['price'].min()
    mean_price = df['price'].mean()
    
    cate = []
    
    for i, each in enumerate(price_list):
        if each < 2500000:
            cate.append(0)

        
        elif each > 4000000:
            cate.append(2)
            
        else:
            cate.append(1)
    
    column_values = pd.Series(cate)
    df.insert(loc=0, column='categories', value=column_values)
    
    return df
    

In [3]:
def tfid_calc(X, n = 1):
    tf = TfidfVectorizer(ngram_range=(n,n), min_df = 10)
    
    x = tf.fit_transform(X)

    return x

In [4]:
def preprocessing(df):
    stop = stopwords.words('swedish') + list(string.punctuation.encode('utf-8')) + ['gt', 'lt', 'amp', 'quot', 'align', '**', '***', '--', '//', '://', '),', ').']
    for i, s in enumerate(stop):
        stop[i] = str(s).replace(u'\xe5', 'aa').replace(u'\xe4', 'ae').replace(u'\xf6', 'oe')
    result = []
    for i, row in df.iterrows():
        sent = []
        doc = row['description']
        for word in nltk.wordpunct_tokenize(doc.lower()):
            if word not in stop and not is_int(word):
                sent.append(word)
        sent = ' '.join(sent)
        result.append(sent)
    df['tokens'] = result

In [6]:
def is_int(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

In [7]:
def tfid_calc(vocab, train, test, n = 1):
    tf = TfidfVectorizer(ngram_range=(n,n), min_df = 10)
    counts = tf.fit(vocab['tokens'])
    train_mtx = tf.transform(train).toarray()
    test_mtx = tf.transform(test).toarray()
    

    return train_mtx, test_mtx


In [8]:
def read_data():
    vocab = pd.read_json('output_new.json')
    df = pd.read_json('sthlm_format.json')
    print('Processing data....')
    preprocessing(vocab)
    preprocessing(df)
    Y = sort_to_categories(df)['categories']
    X = df['tokens']
    
    return X,Y, vocab

In [14]:
def run(X,Y,vocab):
    
    
    test_size = 0.30
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size = test_size, random_state = 0)
    
    
    print('Calculating tfidf....')
   
    X_train, X_test = tfid_calc(vocab, X_train, X_test,1)
    
    print("Classifying....")
    
    
    tree = DecisionTreeClassifier(max_depth=4)
    
    kn = KNeighborsClassifier(n_neighbors=6)
    
    svm = SVC(kernel='linear', probability=True)
    
    gnb = MultinomialNB()
    voting = VotingClassifier(estimators=[('dt', tree), ('knn', kn),
                                         ('bayes', gnb)],
                            voting='soft', weights=[2, 1, 2])

    print('tree')
    y_pred_tree = tree.fit(X_train, y_train).predict(X_test)
    print('kn')
    y_pred_kn = kn.fit(X_train, y_train).predict(X_test)
    #y_pred_svm = svm.fit(X_train, y_train).predict(X_test)
    print('bayes')
    y_pred_bayes = gnb.fit(X_train, y_train).predict(X_test)
    print('voting')
    #y_pred_voting = voting.fit(X_train, y_train).predict(X_test)
        
    print(metrics.accuracy_score(y_test, y_pred_tree))
    print(metrics.accuracy_score(y_test, y_pred_kn))
    #print(metrics.accuracy_score(y_test, y_pred_svm))
    print(metrics.accuracy_score(y_test, y_pred_bayes))
    #print(metrics.accuracy_score(y_test, y_pred_voting))
    
    print(metrics.confusion_matrix(y_test, y_pred_tree))
    print(metrics.confusion_matrix(y_test, y_pred_kn))
    #print(metrics.confusion_matrix(y_test, y_pred_svm))
    print(metrics.confusion_matrix(y_test, y_pred_bayes))
    #print(metrics.confusion_matrix(y_test, y_pred_voting))
    
    
    return y_test, y_pred_bayes
    

In [10]:
X,Y,vocab = read_data()

Processing data....


In [15]:
y_test, y_pred_bayes = run(X,Y,vocab)

Calculating tfidf....
Classifying....
tree
kn
bayes
voting
0.439941046426
0.523212969786
0.610169491525
0.567428150332
[[107 296  25]
 [ 50 372  58]
 [  7 324 118]]
[[224 195   9]
 [112 310  58]
 [ 57 216 176]]
[[225 194   9]
 [ 70 356  54]
 [ 17 185 247]]
[[177 237  14]
 [ 68 348  64]
 [ 13 191 245]]


### Get A classification_report

In [20]:
target_names = ['Low', 'Medium', 'High']

print(metrics.classification_report(y_test, y_pred_bayes, target_names = target_names))

             precision    recall  f1-score   support

        Low       0.72      0.53      0.61       428
     Medium       0.48      0.74      0.59       480
       High       0.80      0.55      0.65       449

avg / total       0.66      0.61      0.61      1357



### Calculate R2-Score for classification

In [22]:
vocab = pd.read_json('output_new.json')
df = pd.read_json('sthlm_format.json')
print('Processing data....')
preprocessing(vocab)
preprocessing(df)
df = sort_to_categories(df)

Processing data....


In [23]:
means = df.groupby('categories').mean()['price'].tolist()
medians = df.groupby('categories').median()['price'].tolist()
y_pred = y_pred_bayes
y_test1 = y_test.as_matrix()

for i in range(3):
    
    np.place(y_pred, y_pred==i, means[i])
    np.place(y_pred, y_pred==i, means[i])
    np.place(y_pred, y_pred==i, means[i])
    
for i in range(3):
    
    np.place(y_test1, y_test1==i, means[i])
    np.place(y_test1, y_test1==i, means[i])
    np.place(y_test1, y_test1==i, means[i])
    


In [24]:
metrics.r2_score(y_test, y_pred)


0.26784654045808698

In [25]:
print(medians)
print(means)

[2000000, 3195000, 5600000]
[2014851.389878831, 3226690.8459214503, 6698648.409556314]


### For visualization of distribution of category samples

In [None]:
import matplotlib.pyplot as plt
ax = df.groupby('categories').size().plot(kind='bar', figsize=(10,2))

In [None]:
plt.show()

### Some play with the number of features, preserved Variance

In [None]:
X = df['tokens']
X_tfidf = tfid_calc(X)

In [None]:
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD

ncomps = 1000
svd = TruncatedSVD(n_components=ncomps)
svd_fit = svd.fit(X_tfidf)
Y = svd.fit_transform(X_tfidf) 
ax = pd.Series(svd_fit.explained_variance_ratio_.cumsum()).plot(kind='line', figsize=(10,3))
print('Variance preserved by first 1000 components == {:.2%}'.format(
        svd_fit.explained_variance_ratio_.cumsum()[-1]))

In [None]:
plt.show()

### Vizualizatino of data 2D

In [None]:
new_X = pd.DataFrame(Y, columns=['c{}'.format(c) for c in range(ncomps)])

In [None]:
import seaborn as sns
sns.set(style="darkgrid", palette="muted")


In [None]:


plotdims = 5
ploteorows = 1
svdcols = [c for c in new_X.columns if c[0] == 'c']
dfsvdplot = new_X[svdcols].iloc[:,:plotdims]
dfsvdplot['class'] = df['class']
ax = sns.pairplot(dfsvdplot.iloc[::ploteorows,:], hue='class', size=1.8)

In [None]:
plt.show()

### Vizualisation 3D

In [None]:
def plot_3d_scatter(A, elevation=30, azimuth=120):
    """ Create 3D scatterplot """
    
    maxpts=1000
    fig = plt.figure(1, figsize=(9, 9))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=elevation, azim=azimuth)
    ax.set_xlabel('component 0')
    ax.set_ylabel('component 1')
    ax.set_zlabel('component 2')

    # plot subset of points
    rndpts = np.sort(np.random.choice(A.shape[0], min(maxpts,A.shape[0]), replace=False))
    coloridx = np.unique(A.iloc[rndpts]['class'], return_inverse=True)
    colors = coloridx[1] / len(coloridx[0])   
    
    sp = ax.scatter(A.iloc[rndpts,0], A.iloc[rndpts,1], A.iloc[rndpts,2]
               ,c=colors, cmap="jet", marker='o', alpha=0.6
               ,s=50, linewidths=0.8, edgecolor='#BBBBBB')

    plt.show()

In [None]:
from IPython.html.widgets import interactive, fixed
from mpl_toolkits.mplot3d import Axes3D

plotdims = 1000

svdcols = [c for c in new_X.columns if c[0] == 'c']
dfsvd = new_X[svdcols].iloc[:,:plotdims]
dfsvd['class'] = df['class']
interactive(plot_3d_scatter, A=fixed(dfsvd), elevation=30, azimuth=120)

### Some other stuff 

In [None]:
new_X.shape, df['categories'].shape

In [None]:
dfsvd['class'] = df['categories']
df = dfsvd

In [None]:
'Size of the dataframe: {}'.format(df.shape)

In [None]:
rndperm = np.random.permutation(df.shape[0])

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
pca_result = pca.fit_transform(df[svdcols].values)

df['pca-one'] = pca_result[:,0]
df['pca-two'] = pca_result[:,1] 
df['pca-three'] = pca_result[:,2]

'Explained variation per principal component: {}'.format(pca.explained_variance_ratio_)


In [None]:


chart = ggplot( df.loc[rndperm[:3000],:], aes(x='pca-one', y='pca-two', color='class') ) \
        + geom_point(size=25,alpha=0.8) \
        + ggtitle("First and Second Principal Components colored by digit")
chart

In [None]:
pca_50 = PCA(n_components=600)
pca_result_50 = pca_50.fit_transform(df[svdcols].values)

'Explained variation per principal component (PCA): {}'.format(np.sum(pca_50.explained_variance_ratio_))

In [None]:
import time

from sklearn.manifold import TSNE

n_sne = 5000

time_start = time.time()

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_pca_results = tsne.fit_transform(pca_result_50[rndperm[:n_sne]])

In [None]:
df_tsne = None
df_tsne = df.loc[rndperm[:n_sne],:].copy()
df_tsne['x-tsne-pca'] = tsne_pca_results[:,0]
df_tsne['y-tsne-pca'] = tsne_pca_results[:,1]

chart = ggplot( df_tsne, aes(x='x-tsne-pca', y='y-tsne-pca', color='class') ) \
        + geom_point(size=20,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by Digit (PCA)")
chart