In [2]:
import nltk, re, glob
import numpy as np
from nltk.tokenize import casual_tokenize
from nltk.tokenize.casual import _replace_html_entities
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeRegression
from sklearn import tree

from sklearn import svm
from sklearn.naive_bayes import GaussianNB
import itertools
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from nltk.stem.lancaster import LancasterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import text 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC

lancaster_stemmer = LancasterStemmer()

lemmatizer = WordNetLemmatizer()

EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715

URLS = r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b/?(?!@))))"


def clean_first(doc):
    # make everything lower case 
    words = doc.lower()   
    # Replace  URLs but some URLs becomes "URLURL"
    words = re.sub(URLS,"URL", words)
    # Replace Numbers 
    # (But we get stuff like 'NUM-NUM' 'NUM.NUMNUM' 'NUMNUM' 'NUMNUM-NUMNUM' 'NUMNUMNUM'
    # 'NUMNUMNUMNUM' 'NUMNUMk' 'NUMnd' 'NUMwhy')
    #cleaned = re.sub('\d+', "NUM", words)
    cleaned = re.sub("[^a-zA-Z\s\W]+", "", words) #replace NUM with nothing
   # cleaned = re.sub(r"([\w\d]+\.)([\w\d]+)", r"\1 \2", words)
   # print(cleaned)
    return(cleaned)
   
def lemmatize(doc):   
    lemma_list = []
    wnl = WordNetLemmatizer()
    # cleans the document
    cleaned_doc = clean_first(doc)
    # split into sentences
    sent_text = nltk.sent_tokenize(cleaned_doc)
    for line in sent_text:
    # lemmatize with casual_tokenizer from http://www.nltk.org/_modules/nltk/tokenize/casual.html#TweetTokenizer (we can use something else)
        lemma_list1 = [wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i) for i,j in pos_tag(casual_tokenize(line))]
        lemma_list = lemma_list + lemma_list1
    return(lemma_list)

def stemmer(doc):
    stem_list= []
    l_stemmer = LancasterStemmer()
    cleaned_doc = clean_first(doc)
    sent_text = nltk.sent_tokenize(cleaned_doc)
    for line in sent_text:
        for word in nltk.word_tokenize(line):
            stem_list.append(l_stemmer.stem(word))
    return(stem_list)

def lem_stemmer(doc):
    stem_list=[]
    lemma = lemmatize(doc)
    l_stemmer = LancasterStemmer()
    for word in lemma:
        stem_list.append(l_stemmer.stem(word))
    return(stem_list)
    
def make_combs():
    combs = []
    poslist=[ 'j', 'n', 'r','v']
    #'CC', 'DT', 'EX',  'UH' 'TO' , 'w'  'i',  'm',
    for L in range(0, len(poslist)+1): 
        for subset in itertools.combinations(poslist, L): # get all the possible combination of POSs
            if subset != ():  
                if len(subset) < 5:
                    combs.append(subset) # append in the list "comb"
    return combs
    #print(combs[1])
    
def get_selected_lemma(doc,comb):
   # combs = make_combs()
    lemma_list = []
    wnl = WordNetLemmatizer()        
    # cleans the document
    cleaned_doc = clean_first(doc)
    # split into sentences
    sent_text = nltk.sent_tokenize(cleaned_doc)
    for line in sent_text:
    # lemmatize with casual_tokenizer from http://www.nltk.org/_modules/nltk/tokenize/casual.html#TweetTokenizer (we can use something else)
       # lemma_list1 = [wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i) for i,j in pos_tag(casual_tokenize(line))]
        for j in pos_tag(casual_tokenize(line)):
           
            if str(j[1][0].lower()) in comb:
                if j[1][0].lower() in ['j']: #,'n','v']:
                    lemma = wnl.lemmatize(j[0],'a')
                elif j[1][0].lower() in ['n','v']:
                    lemma = wnl.lemmatize(j[0],j[1][0].lower())
                else:
                    lemma = wnl.lemmatize(j[0])
                lemma_list.append(lemma)
                
    return(lemma_list)


# lemmatize and make n-grams
def make_ngrams(text,n):
    ngram_list = []
    text = re.sub(r"([\w\d]+[\.!?]+)([\w\d]+)", r"\1 \2", text)
    sent_text = nltk.sent_tokenize(text) 
    #print(sent_text)
    for sentence in sent_text:
        lemma_list = lemmatize(sentence) 
        #print(lemma_list)
        for i in range(len(lemma_list) - (n - 1)):
            ngram = lemma_list[i:i+n]
            stringngram = " ".join(ngram) #turn the list into string
            ngram_list.append(stringngram) #append the n-gram of words into a list
    return ngram_list

def make_ChNgrams(text, n): 
    ngram_list=[]
    text = re.sub(r"([\w\d]+[\.!?]+)([\w\d]+)", r"\1 \2", text)
    sent_text = nltk.sent_tokenize(text) 
    for sentence in sent_text:
        #sentence = sentence.strip([\n\t])
        new_sent = re.sub("/\s+/S","",sentence) # remove spaces
        for i in range(len(new_sent) - (n-1)):
            ngram=new_sent[i:i+n]
            ngram_list.append(ngram)
    return ngram_list 
    

# Feature: tfidf 

In [34]:
stopset = set(nltk.corpus.stopwords.words('english'))
stopset.add("\'")
stopset.add("\"")

reddit_train = load_files("/media/mh/EF2A-B9DB/Reddit_Data/Train/")
X, y = reddit_train.data, reddit_train.target
# Check the data


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print("samples per class: {}".format(np.bincount(y_train)))
print("Data: {}".format(np.bincount(y_test)))
#print(X_train.shape)



#lemma_vect = CountVectorizer(tokenizer=lambda doc: get_selected_lemma(doc,comb), stop_words =stopset, min_df=2)
vect = TfidfVectorizer(analyzer='word', tokenizer=lemmatize, sublinear_tf=True, min_df=2,stop_words=stopset)
X_train = vect.fit_transform(X_train)
print('X_train.shape:\n{}'.format(X_train.shape))


# To check features (the first 200)
#    feature_names_lemma = np.array(lemma_vect.get_feature_names())
#    print(feature_names_lemma[2000:2500])

max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
feature_names=np.array(vect.get_feature_names())
print("features with highest tfidf:\n{}".format(feature_names[sorted_by_tfidf[-300:]]))
print("features with lowest tfidf:\n{}".format(feature_names[sorted_by_tfidf[:300]]))



mnNB = MultinomialNB()


print("\nno feature selection\n")

X_test = vect.transform(X_test) 
#        print('X.shape:\n{}'.format(X_train.shape))

predict = mnNB.fit(X_train,y_train).predict(X_test)


predscore = mnNB.fit(X_train,y_train).score(X_test,y_test)
print('NB Test {:.3f}'.format(predscore))

print(classification_report(y_test, predict, target_names=['Dep','ND']))
#  print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

predict = svm.SVC().fit(X_train,y_train).predict(X_test)
predscore = svm.SVC().fit(X_train,y_train).score(X_test,y_test)
print("SVM Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

predict = LogisticRegression().fit(X_train,y_train).predict(X_test)
predscore = LogisticRegression().fit(X_train,y_train).score(X_test,y_test)
print("LR Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))


pca = TruncatedSVD(n_components=100, random_state=0).fit(X_train)
#logistic = LogisticRegression()
# pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("X_train_pca.shape: {}".format(X_train_pca.shape))



predscore = GaussianNB().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print('GaussianNB Test {:.3f}'.format(predscore))

print(classification_report(y_test, predict, target_names=['Dep','ND']))
# print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

predict = svm.SVC().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = svm.SVC().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("SVM Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

predict = LogisticRegression().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = LogisticRegression().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("LR Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

"""
print('feature selection\n')
lr = LogisticRegression(C=1, penalty='l1').fit(X_train, y_train)
model = SelectFromModel(lr, prefit=True)
X_new = model.transform(X_train)
print(X_new.shape)

#print(X_new.get_feature_names())
X_test_1= model.transform(X_test)
#print("X_test:\n{}".format(repr(X_test_1)))

predscore = mnNB.fit(X_new,y_train).score(X_test_1,y_test)
print('NB Test {:.3f}'.format(predscore))

predscore = svm.SVC().fit(X_new,y_train).score(X_test_1,y_test)
print("SVM Test score: {:.3f}".format(predscore))

predscore = LogisticRegression().fit(X_new, y_train).score(X_test_1,y_test)
print("LR Test score: {:.3f}".format(predscore))
"""

samples per class: [759 840]
Data: [255 278]
X_train.shape:
(1599, 29682)
features with highest tfidf:
['species' 'tru' 'kara' 'sx' 'theydidthefuckyou' 'yoga' '\u200d' 'schwartz'
 'alexa' '♥' 'hawaii' 'nga' 'sht' 'grassroots' 'taskbar' 'pokeball'
 'piercer' 'toefl' 'photojournalist' "juice's" 'vector' 'puzzle' 'xda'
 'repost' 'wholesomeness' 'violent' '🤗' 'trader' 'cheapo' 'gd' 'chester'
 "o'reilly" 'cam' 'naomi' 'trusted' 'darvish' 'otp' 'pounder' 'phish'
 'snort' 'fv' 'opium' '<' 'lsu' 'brothel' 'clause' 'porygon' 'kadabra'
 'cleft' 'dbd' 'deoderant' 'epileptic' 'jinder' 'stamford' 'sperm' 'porky'
 'ahaha' '️' 'mastery' 'nitro' 'coke' 'vox' 'neighbourhood' 'skaven'
 'jansen' 'dota' 'goku' '(:' 'nitros' 'fatale' 'deposit' 'trata' 'batch'
 '✊' 'jailbreak' 'orthos' 'tortoise' 'illenium' 'bu' 'messi' 'petrol' '👈'
 'hatch' 'fcking' '🙈' 'warranty' 'sapphire' 'tarantula' 'router' 'qc'
 'usersimulator' 'wenger' 'lap' 'feng' '🏽' 'beany' 'skype' 'verry'
 'registration' 'infp' 'dxm' 'mio' '👍' '

  'precision', 'predicted', average, warn_for)


LR Test score: 0.704
             precision    recall  f1-score   support

        Dep       0.77      0.55      0.64       255
         ND       0.67      0.85      0.75       278

avg / total       0.72      0.70      0.70       533

X_train_pca.shape: (1599, 100)
GaussianNB Test 0.540
             precision    recall  f1-score   support

        Dep       0.77      0.55      0.64       255
         ND       0.67      0.85      0.75       278

avg / total       0.72      0.70      0.70       533

SVM Test score: 0.522
             precision    recall  f1-score   support

        Dep       0.00      0.00      0.00       255
         ND       0.52      1.00      0.69       278

avg / total       0.27      0.52      0.36       533

LR Test score: 0.698
             precision    recall  f1-score   support

        Dep       0.75      0.55      0.64       255
         ND       0.67      0.83      0.74       278

avg / total       0.71      0.70      0.69       533



'\nprint(\'feature selection\n\')\nlr = LogisticRegression(C=1, penalty=\'l1\').fit(X_train, y_train)\nmodel = SelectFromModel(lr, prefit=True)\nX_new = model.transform(X_train)\nprint(X_new.shape)\n\n#print(X_new.get_feature_names())\nX_test_1= model.transform(X_test)\n#print("X_test:\n{}".format(repr(X_test_1)))\n\npredscore = mnNB.fit(X_new,y_train).score(X_test_1,y_test)\nprint(\'NB Test {:.3f}\'.format(predscore))\n\npredscore = svm.SVC().fit(X_new,y_train).score(X_test_1,y_test)\nprint("SVM Test score: {:.3f}".format(predscore))\n\npredscore = LogisticRegression().fit(X_new, y_train).score(X_test_1,y_test)\nprint("LR Test score: {:.3f}".format(predscore))\n'

In [41]:
pca = TruncatedSVD(n_components=900, random_state=42).fit(X_train)
#logistic = LogisticRegression()
# pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("X_train_pca.shape: {}".format(X_train_pca.shape))



predscore = GaussianNB().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print('GaussianNB Test {:.3f}'.format(predscore))

print(classification_report(y_test, predict, target_names=['Dep','ND']))
# print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

predict = svm.SVC().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = svm.SVC().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("SVM Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

predict = LogisticRegression().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = LogisticRegression().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("LR Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))


X_train_pca.shape: (1599, 900)
GaussianNB Test 0.480
             precision    recall  f1-score   support

        Dep       0.77      0.57      0.65       255
         ND       0.68      0.85      0.75       278

avg / total       0.72      0.71      0.71       533

SVM Test score: 0.522
             precision    recall  f1-score   support

        Dep       0.00      0.00      0.00       255
         ND       0.52      1.00      0.69       278

avg / total       0.27      0.52      0.36       533



  'precision', 'predicted', average, warn_for)


LR Test score: 0.700
             precision    recall  f1-score   support

        Dep       0.76      0.55      0.64       255
         ND       0.67      0.84      0.74       278

avg / total       0.71      0.70      0.69       533



In [None]:
Selected Lemmas

In [51]:
stopset = set(nltk.corpus.stopwords.words('english'))
stopset.add("\'")
stopset.add("\"")

reddit_train = load_files("/media/mh/EF2A-B9DB/Reddit_Data/Train/")
X, y = reddit_train.data, reddit_train.target
# Check the data


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print("samples per class: {}".format(np.bincount(y_train)))
print("Data: {}".format(np.bincount(y_test)))
#print(X_train.shape)


combs = make_combs()
combs = [('j')]
for comb in combs:
    print(comb)
    if len(comb) < 3:
        vect = CountVectorizer(tokenizer=lambda doc: get_selected_lemma(doc,comb), stop_words =stopset, min_df=2)
       
        X_train = vect.fit_transform(X_train)
        print('X_train.shape:\n{}'.format(X_train.shape))

        mnNB = MultinomialNB()


        print("\nno feature selection\n")

        X_test = vect.transform(X_test) 
        #        print('X.shape:\n{}'.format(X_train.shape))

        predict = mnNB.fit(X_train,y_train).predict(X_test)


        predscore = mnNB.fit(X_train,y_train).score(X_test,y_test)
        print('NB Test {:.3f}'.format(predscore))

        print(classification_report(y_test, predict, target_names=['Dep','ND']))
        #  print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

        predict = svm.SVC().fit(X_train,y_train).predict(X_test)
        predscore = svm.SVC().fit(X_train,y_train).score(X_test,y_test)
        print("SVM Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        predict = LogisticRegression().fit(X_train,y_train).predict(X_test)
        predscore = LogisticRegression().fit(X_train,y_train).score(X_test,y_test)
        print("LR Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        print('feature selection\n')
        lr = LogisticRegression(C=1, penalty='l1').fit(X_train, y_train)
        model = SelectFromModel(lr, prefit=True)
        X_new = model.transform(X_train)
        print(X_new.shape)

        #print(X_new.get_feature_names())
        X_test_1= model.transform(X_test)
        #print("X_test:\n{}".format(repr(X_test_1)))

        predict = mnNB.fit(X_train,y_train).predict(X_test)
        predscore = mnNB.fit(X_new,y_train).score(X_test_1,y_test)
        print('NB Test {:.3f}'.format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        predscore = svm.SVC().fit(X_new,y_train).score(X_test_1,y_test)
        print("SVM Test score: {:.3f}".format(predscore))
        predict = svm.SVC().fit(X_new,y_train).predict(X_test)
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        predscore = LogisticRegression().fit(X_new, y_train).score(X_test_1,y_test)
        print("LR Test score: {:.3f}".format(predscore))
        predict = LogisticRegression().fit(X_new,y_train).predict(X_test)
        print(classification_report(y_test, predict, target_names=['Dep','ND']))
        
        print('\n')
        
    
        pca = TruncatedSVD(n_components=1000, random_state=0).fit(X_train)
        #logistic = LogisticRegression()
       # pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        print("X_train_pca.shape: {}".format(X_train_pca.shape))
        
        
        
        predscore = GaussianNB().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print('GaussianNB Test {:.3f}'.format(predscore))
        
        print(classification_report(y_test, predict, target_names=['Dep','ND']))
       # print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

        predict = svm.SVC().fit(X_train_pca,y_train).predict(X_test_pca)
        predscore = svm.SVC().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print("SVM Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        predict = LogisticRegression().fit(X_train_pca,y_train).predict(X_test_pca)
        predscore = LogisticRegression().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print("LR Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))


samples per class: [759 840]
Data: [255 278]
j
X_train.shape:
(1599, 9161)

no feature selection

NB Test 0.642
             precision    recall  f1-score   support

        Dep       0.65      0.56      0.60       255
         ND       0.64      0.72      0.68       278

avg / total       0.64      0.64      0.64       533

SVM Test score: 0.570
             precision    recall  f1-score   support

        Dep       0.70      0.18      0.28       255
         ND       0.55      0.93      0.69       278

avg / total       0.62      0.57      0.50       533

LR Test score: 0.608
             precision    recall  f1-score   support

        Dep       0.59      0.58      0.58       255
         ND       0.62      0.64      0.63       278

avg / total       0.61      0.61      0.61       533

feature selection

(1599, 781)
NB Test 0.612
             precision    recall  f1-score   support

        Dep       0.65      0.56      0.60       255
         ND       0.64      0.72      0.68      

ValueError: X.shape[1] = 9161 should be equal to 781, the number of features at training time

In [50]:
        pca = TruncatedSVD(n_components=100, random_state=0).fit(X_train)
        #logistic = LogisticRegression()
       # pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        print("X_train_pca.shape: {}".format(X_train_pca.shape))
        
        
        
        predscore = GaussianNB().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print('GaussianNB Test {:.3f}'.format(predscore))
        
        print(classification_report(y_test, predict, target_names=['Dep','ND']))
       # print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

        predict = svm.SVC().fit(X_train_pca,y_train).predict(X_test_pca)
        predscore = svm.SVC().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print("SVM Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        predict = LogisticRegression().fit(X_train_pca,y_train).predict(X_test_pca)
        predscore = LogisticRegression().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print("LR Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

X_train_pca.shape: (1599, 100)
GaussianNB Test 0.540
             precision    recall  f1-score   support

        Dep       0.80      0.02      0.03       255
         ND       0.52      1.00      0.69       278

avg / total       0.66      0.53      0.37       533

SVM Test score: 0.574
             precision    recall  f1-score   support

        Dep       0.64      0.25      0.36       255
         ND       0.56      0.87      0.68       278

avg / total       0.60      0.57      0.53       533

LR Test score: 0.627
             precision    recall  f1-score   support

        Dep       0.62      0.55      0.59       255
         ND       0.63      0.69      0.66       278

avg / total       0.63      0.63      0.62       533



# char-ngrams

In [53]:


reddit_train = load_files("/media/mh/EF2A-B9DB/Reddit_Data/Train/")
X, y = reddit_train.data, reddit_train.target
# Check the data


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print("samples per class: {}".format(np.bincount(y_train)))
print("Data: {}".format(np.bincount(y_test)))
#print(X_train.shape)



for n in range(6,7):
    print(n)
    if n !=0:
        reddit_train = load_files("/media/mh/EF2A-B9DB/Reddit_Data/Train/")
        X, y = reddit_train.data, reddit_train.target
        # Check the data


        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

        print("samples per class: {}".format(np.bincount(y_train)))
        print("Data: {}".format(np.bincount(y_test)))
        #print(X_train.shape)


        vect = CountVectorizer(analyzer='char', ngram_range = (n, n),encoding='utf-8')
       
        X_train = vect.fit_transform(X_train)
        print('X_train.shape:\n{}'.format(X_train.shape))

        mnNB = MultinomialNB()


        print("\nno feature selection\n")

        X_test = vect.transform(X_test) 
        #        print('X.shape:\n{}'.format(X_train.shape))

        predict = mnNB.fit(X_train,y_train).predict(X_test)


        predscore = mnNB.fit(X_train,y_train).score(X_test,y_test)
        print('NB Test {:.3f}'.format(predscore))

        print(classification_report(y_test, predict, target_names=['Dep','ND']))
        #  print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

        predict = LinearSVC().fit(X_train,y_train).predict(X_test)
        predscore = LinearSVC().fit(X_train,y_train).score(X_test,y_test)
        print("SVM Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        predict = LogisticRegression().fit(X_train,y_train).predict(X_test)
        predscore = LogisticRegression().fit(X_train,y_train).score(X_test,y_test)
        print("LR Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))
        """
        print('feature selection\n')
        lr = LogisticRegression(C=1, penalty='l1').fit(X_train, y_train)
        model = SelectFromModel(lr, prefit=True)
        X_new = model.transform(X_train)
        print(X_new.shape)

        #print(X_new.get_feature_names())
        X_test_1= model.transform(X_test)
        #print("X_test:\n{}".format(repr(X_test_1)))

        predscore = mnNB.fit(X_new,y_train).score(X_test_1,y_test)
        print('NB Test {:.3f}'.format(predscore))

        predscore = LinearSVC().fit(X_new,y_train).score(X_test_1,y_test)
        print("SVM Test score: {:.3f}".format(predscore))

        predscore = LogisticRegression().fit(X_new, y_train).score(X_test_1,y_test)
        print("LR Test score: {:.3f}".format(predscore))
        
        """ 
    
        pca = TruncatedSVD(n_components=100, random_state=0).fit(X_train)
        #logistic = LogisticRegression()
       # pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        print("X_train_pca.shape: {}".format(X_train_pca.shape))
        
        
        
        predscore = GaussianNB().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print('GaussianNB Test {:.3f}'.format(predscore))
        
        print(classification_report(y_test, predict, target_names=['Dep','ND']))
       # print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

        predict = LinearSVC().fit(X_train_pca,y_train).predict(X_test_pca)
        predscore = LinearSVC().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print("SVM Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

        predict = LogisticRegression().fit(X_train_pca,y_train).predict(X_test_pca)
        predscore = LogisticRegression().fit(X_train_pca,y_train).score(X_test_pca,y_test)
        print("LR Test score: {:.3f}".format(predscore))
        print(classification_report(y_test, predict, target_names=['Dep','ND']))

samples per class: [759 840]
Data: [255 278]
6
samples per class: [759 840]
Data: [255 278]
X_train.shape:
(1599, 1806976)

no feature selection

NB Test 0.662
             precision    recall  f1-score   support

        Dep       0.77      0.42      0.54       255
         ND       0.62      0.89      0.73       278

avg / total       0.70      0.66      0.64       533

SVM Test score: 0.687
             precision    recall  f1-score   support

        Dep       0.68      0.61      0.64       255
         ND       0.67      0.74      0.70       278

avg / total       0.68      0.68      0.67       533

LR Test score: 0.687
             precision    recall  f1-score   support

        Dep       0.70      0.61      0.65       255
         ND       0.68      0.76      0.72       278

avg / total       0.69      0.69      0.68       533



MemoryError: 

# word n-grams

In [None]:
#
#Classification using scikit-learn
#

#load data  (change the file path)
reddit_train = load_files("/media/mh/EF2A-B9DB/Reddit_Data/Train/")
X, y = reddit_train.data, reddit_train.target
# Check the data
print("Data: {}".format(np.bincount(y)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    
#vect = CountVectorizer(tokenizer=lemmatize,  stop_words =stopset, ngram_range = (3, 3) )

vect = CountVectorizer(tokenizer=lambda doc: make_ngrams(doc,1), min_df=2)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)
print('X_train_charngram.shape:\n{}'.format(X_train.shape))


# To check features (the first 100)
#feature_names = np.array(vect.get_feature_names())
#print(feature_names[1000:1500])

print("samples per class: {}".format(np.bincount(y_train)))
print("Data: {}".format(np.bincount(y_test)))
#print(X_train.shape)
print('X_train.shape:\n{}'.format(X_train.shape))

mnNB = MultinomialNB()


print("\nno feature selection\n")

 
#        print('X.shape:\n{}'.format(X_train.shape))

predict = mnNB.fit(X_train,y_train).predict(X_test)


predscore = mnNB.fit(X_train,y_train).score(X_test,y_test)
print('NB Test {:.3f}'.format(predscore))

print(classification_report(y_test, predict, target_names=['Dep','ND']))
#  print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

predict = LinearSVC().fit(X_train,y_train).predict(X_test)
predscore = LinearSVC().fit(X_train,y_train).score(X_test,y_test)
print("SVM Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

predict = LogisticRegression().fit(X_train,y_train).predict(X_test)
predscore = LogisticRegression().fit(X_train,y_train).score(X_test,y_test)
print("LR Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))
"""
print('feature selection\n')
lr = LogisticRegression(C=1, penalty='l1').fit(X_train, y_train)
model = SelectFromModel(lr, prefit=True)
X_new = model.transform(X_train)
print(X_new.shape)

#print(X_new.get_feature_names())
X_test_1= model.transform(X_test)
#print("X_test:\n{}".format(repr(X_test_1)))

predscore = mnNB.fit(X_new,y_train).score(X_test_1,y_test)
print('NB Test {:.3f}'.format(predscore))

predscore = LinearSVC().fit(X_new,y_train).score(X_test_1,y_test)
print("SVM Test score: {:.3f}".format(predscore))

predscore = LogisticRegression().fit(X_new, y_train).score(X_test_1,y_test)
print("LR Test score: {:.3f}".format(predscore))

""" 

pca = TruncatedSVD(n_components=1000, random_state=0).fit(X_train)
#logistic = LogisticRegression()
# pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("X_train_pca.shape: {}".format(X_train_pca.shape))



predscore = GaussianNB().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print('GaussianNB Test {:.3f}'.format(predscore))

print(classification_report(y_test, predict, target_names=['Dep','ND']))
# print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

predict = LinearSVC().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = LinearSVC().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("SVM Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

predict = LogisticRegression().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = LogisticRegression().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("LR Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))


Data: [1012 1118]
X_train_charngram.shape:
(1597, 30126)
samples per class: [759 838]
Data: [253 280]
X_train.shape:
(1597, 30126)

no feature selection

NB Test 0.670
             precision    recall  f1-score   support

        Dep       0.72      0.50      0.59       253
         ND       0.65      0.82      0.72       280

avg / total       0.68      0.67      0.66       533



In [57]:
predict = LinearSVC().fit(X_train,y_train).predict(X_test)
predscore = LinearSVC().fit(X_train,y_train).score(X_test,y_test)
print("SVM Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

predict = LogisticRegression().fit(X_train,y_train).predict(X_test)
predscore = LogisticRegression().fit(X_train,y_train).score(X_test,y_test)
print("LR Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))
"""
print('feature selection\n')
lr = LogisticRegression(C=1, penalty='l1').fit(X_train, y_train)
model = SelectFromModel(lr, prefit=True)
X_new = model.transform(X_train)
print(X_new.shape)

#print(X_new.get_feature_names())
X_test_1= model.transform(X_test)
#print("X_test:\n{}".format(repr(X_test_1)))

predscore = mnNB.fit(X_new,y_train).score(X_test_1,y_test)
print('NB Test {:.3f}'.format(predscore))

predscore = LinearSVC().fit(X_new,y_train).score(X_test_1,y_test)
print("SVM Test score: {:.3f}".format(predscore))

predscore = LogisticRegression().fit(X_new, y_train).score(X_test_1,y_test)
print("LR Test score: {:.3f}".format(predscore))

""" 

pca = TruncatedSVD(n_components=1000, random_state=0).fit(X_train)
#logistic = LogisticRegression()
# pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("X_train_pca.shape: {}".format(X_train_pca.shape))



predscore = GaussianNB().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print('GaussianNB Test {:.3f}'.format(predscore))

print(classification_report(y_test, predict, target_names=['Dep','ND']))
# print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

predict = LinearSVC().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = LinearSVC().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("SVM Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))

predict = LogisticRegression().fit(X_train_pca,y_train).predict(X_test_pca)
predscore = LogisticRegression().fit(X_train_pca,y_train).score(X_test_pca,y_test)
print("LR Test score: {:.3f}".format(predscore))
print(classification_report(y_test, predict, target_names=['Dep','ND']))


ValueError: X has 386272 features per sample; expecting 878545

In [None]:
from nltk.parse.stanford import StanfordDependencyParser
