In [1]:
import pandas as pd 
import sklearn
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import tree
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2

from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest

from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support


import gensim, logging
from gensim.models import Word2Vec
from scipy import sparse



In [2]:
def loadData(filePath="dataset.csv"):
    data=pd.read_csv("/home/administrator/data/categories-data/Train-Data/fps-with-cat-train.csv")
    data['CategoryFB'] = data['CategoryFB'].fillna(data['CategoryV2'])
    data['Description'] = data['Description'].fillna(data['Name'])
    return data["Tag"],data["Description"],data["CategoryV2"]

In [3]:
def preProcessing(features):
    num_descs = features.size
    clean_wordlist = []
    clean_descs = []
    stops = set(stopwords.words('english'))
    #letters_only = []
    for i in range( 0, num_descs):
        #letters_only = re.sub("[^a-zA-Z]", " ", features[i]) 
        words = features[i].lower().split()
        words = [w.lower() for w in words if not w in stops]  
        clean_wordlist.append(words)
        clean_descs.append(" ".join(words))
    return clean_descs, clean_wordlist

In [4]:
def getDTMByTFIDF(features,nfeatures):
    tfIdf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english', max_features=nfeatures)
    dtm = tfIdf_vectorizer.fit_transform(features).toarray()
    return dtm,tfIdf_vectorizer

In [5]:
def featuresByChiSq(features,labels,nFeature=5000):
    chi2_model = SelectKBest(chi2,k=nFeature)
    dtm = chi2_model.fit_transform(features,labels)
    return dtm,chi2_model

def featuresByInformationGain(features,labels):
    treeCL = tree.DecisionTreeClassifier(criterion="entropy")
    treeCL = treeCL.fit(features,labels)
    transformed_features = SelectFromModel(treeCL,prefit=True).transform(features)
    return transformed_features

def featuresByLSA(features,ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer =  Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa

In [6]:
def crossValidate(document_term_matrix,labels,classifier="SVM",nfold=10):
    clf = None
    precision = []
    recall = []
    fscore = []
    
    if classifier == "RF":
        clf = RandomForestClassifier()
    elif classifier == "NB":
        clf = MultinomialNB()
    elif classifier == "SVM":
        clf = LinearSVC()
    
    skf = StratifiedKFold(labels, n_folds=nfold)

    for train_index, test_index in skf:
        X_train, X_test = document_term_matrix[train_index], document_term_matrix[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = clf.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        p,r,f,s = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precision.append(p)
        recall.append(r)
        fscore.append(f)
        
    return round(np.mean(precision),3),round(np.mean(recall),3),round(np.mean(fscore),3)

In [7]:
tags, descs, labels = loadData()

In [8]:
processed_descs, processed_descs_wordlist = preProcessing(descs)

In [10]:
dtm,vect = getDTMByTFIDF(processed_descs,1000)

In [11]:
chisqDtm, chisqModel = featuresByChiSq(dtm,labels,1000)

In [12]:
precision, recall, fscore = crossValidate(chisqDtm,labels,"SVM",10)



  'precision', 'predicted', average, warn_for)


In [13]:
print precision, recall, fscore

0.871 0.843 0.848


In [14]:
print precision, recall, fscore

0.871 0.843 0.848


In [15]:
print precision, recall, fscore

0.871 0.843 0.848


In [16]:
print precision, recall, fscore

0.871 0.843 0.848


In [17]:
print precision, recall, fscore

0.871 0.843 0.848


In [20]:
precision, recall, fscore = crossValidate(chisqDtm,labels,"SVM",10)

In [21]:
print precision, recall, fscore

0.871 0.843 0.848


In [22]:
precision, recall, fscore = crossValidate(chisqDtm,labels,"RF",10)

  'recall', 'true', average, warn_for)


In [24]:
print precision, recall, fscore

0.874 0.84 0.849


In [32]:
features = featuresByInformationGain(dtm, labels)

In [34]:
precision, recall, fscore = crossValidate(features,labels,"RF",10)

In [35]:
print precision, recall, fscore

0.878 0.821 0.837


In [4]:
precision, recall, fscore = crossValidate(features,labels,"SVM",10)

NameError: name 'crossValidate' is not defined

In [5]:
print precision, recall, fscore

NameError: name 'precision' is not defined

In [40]:
dtm_lsa = featuresByLSA(features, 92)

In [41]:
precision, recall, fscore = crossValidate(dtm_lsa,labels,"SVM",10)

In [42]:
print precision, recall, fscore

0.872 0.822 0.835


In [43]:
precision, recall, fscore = crossValidate(dtm_lsa,labels,"RF",10)

In [44]:
print precision, recall, fscore

0.864 0.814 0.825


In [45]:
precision, recall, fscore = crossValidate(dtm_lsa,labels,"NB",10)

ValueError: Input X must be non-negative

In [47]:
features

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.61392957,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [48]:
dtm_lsa

array([[  4.85548150e-03,   3.82477903e-03,   8.91496051e-03, ...,
         -2.44857612e-03,  -1.42457299e-03,  -2.42991661e-03],
       [  8.11686333e-04,   8.66736200e-03,   7.13570984e-02, ...,
          1.15673899e-04,   1.17445838e-04,   8.05562629e-05],
       [  3.39246104e-02,  -3.15091497e-02,  -5.82961762e-01, ...,
         -1.02275842e-02,  -3.04194750e-02,  -5.94205376e-03],
       ..., 
       [  4.52296482e-05,   3.73206434e-04,   3.48034048e-04, ...,
         -2.11837322e-04,   7.02490323e-05,   1.97894171e-04],
       [  1.04404031e-02,   1.78252776e-03,   8.39496288e-03, ...,
          7.78203450e-03,  -2.25447559e-02,  -1.43523961e-03],
       [  3.05359938e-05,   2.72228853e-04,   3.96066487e-04, ...,
          2.08198363e-03,   1.75077965e-03,   7.11148713e-04]])

In [26]:
train_descs, test_descs = train_test_split(descs, test_size=0.1, 
                                           random_state=42)

In [27]:
from sklearn.cross_validation import train_test_split

In [28]:
train_descs, test_descs = train_test_split(descs, test_size=0.1, 
                                           random_state=42)

In [29]:
train_labels, test_labels = train_test_split(labels, test_size=0.1, 
                                           random_state=42)

In [30]:
train_descs = train_descs.reset_index(drop=True)

In [31]:
test_descs = test_descs.reset_index(drop=True)

In [32]:
train_labels = train_labels.reset_index(drop=True)

In [33]:
test_labels = test_labels.reset_index(drop=True)

In [34]:
train_labels[345]

'ELECTRONICS'

In [35]:
len(test_labels)

3452

In [36]:
processed__train_descs, processed_train_descs_wordlist = preProcessing(train_descs)

In [37]:
processed_test_descs, processed_test_descs_wordlist = preProcessing(test_descs)

In [38]:
len(processed_test_descs)

3452

In [39]:
dtm_train,vect_train = getDTMByTFIDF(processed__train_descs,2000)

In [40]:
chisqDtmTrain, chisqModelTrain = featuresByChiSq(dtm_train,train_labels,2000)

In [41]:
clf = LinearSVC()

In [42]:
chisqDtmTrain

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [43]:
model = clf.fit(chisqDtmTrain, train_labels)

In [44]:
dtm_test,vect_test = getDTMByTFIDF(processed_test_descs,2000)

In [45]:
chisqDtmTest, chisqModelTest = featuresByChiSq(dtm_test,test_labels,2000)

In [46]:
y_pred = model.predict(chisqDtmTest)

In [47]:
p,r,f,s = precision_recall_fscore_support(test_labels, y_pred, average='weighted')

In [48]:
print p,r,f,s

0.268109557822 0.25955967555 0.2519524902 None


In [49]:
outputv2=pd.DataFrame( data={"Desc":processed_test_descs,"Prediction"
:y_pred,"CategoryV2":test_labels} )

In [50]:
outputv2.to_csv( "svm-prediction-v3.csv", index=False, quoting=3, escapechar= '\\' )

In [51]:
len(outputv2)

3452

In [52]:
labels.value_counts()

MEDICAL                    4413
ELECTRONICS                4372
EDUCATION                  3986
TOURISM                    3677
FASHION                    3102
HOTEL                      3025
BEAUTY & SPA               1613
FURNITURE                  1545
HARDWARE & SANITARYWARE    1158
FOOD & BEVERAGES           1130
JEWELLERY                  1073
REAL ESTATE                 824
HOME MAINTENANCE            805
SPORTS                      719
SECURITY                    665
PHOTOGRAPHY                 615
EVENT                       181
AUTOMOTIVE                  169
BLOGS                       155
MANUFACTURING               155
KIDS                        149
HEALTH                      115
INTERIOR DESIGN              92
PROMOTION                    90
RETAIL                       67
INDIVIDUAL                   66
CHEMICALS                    53
HOME CARE                    50
HOUSING                      39
ARTS                         38
ARCHITECTURE                 38
ENTERTAI

In [53]:
from sklearn.metrics import accuracy_score

In [54]:
accuracy_score(test_labels, y_pred)

0.25955967555040554

In [55]:
from sklearn.metrics import classification_report
print classification_report(test_labels, y_pred)

                         precision    recall  f1-score   support

           ARCHITECTURE       0.00      0.00      0.00         3
                   ARTS       0.00      0.00      0.00         2
             AUTOMOTIVE       0.00      0.00      0.00        17
           BEAUTY & SPA       0.04      0.01      0.02       171
                  BLOGS       0.00      0.00      0.00        13
               CATERING       0.00      0.00      0.00         1
              CHEMICALS       0.00      0.00      0.00         7
           CONSTRUCTION       0.00      0.00      0.00         2
              EDUCATION       0.37      0.48      0.42       400
            ELECTRONICS       0.50      0.51      0.51       396
          ENTERTAINMENT       0.00      0.00      0.00         7
                  EVENT       0.00      0.00      0.00         9
                FASHION       0.15      0.11      0.12       320
                FLORIST       0.00      0.00      0.00         4
       FOOD & BEVERAGES 

In [56]:
from sklearn.metrics import confusion_matrix

In [None]:
print confusion_matrix(test_labels, y_pred, test_labels)

In [64]:
import numpy
numpy.set_printoptions(threshold=numpy.nan)

In [None]:
print confusion_matrix(test_labels, y_pred, test_labels)