In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import numpy as np

In [55]:
def Logistic_Regression(X_train,X_test,y_train,y_test):
    """Fit an L1-penalised logistic regression and print its accuracy.

    Parameters
    ----------
    X_train, X_test
        Feature matrices handed to ``fit`` / ``predict``.
    y_train, y_test
        True labels for the corresponding feature matrices.

    Returns
    -------
    None
        The train and test accuracy are printed, not returned.
    """
    # The L1 penalty only works with a solver that supports it. Older
    # scikit-learn releases defaulted to 'liblinear'; newer ones default to
    # 'lbfgs', which rejects penalty='l1'. Naming the solver keeps the call
    # valid (and the model the same) regardless of the library default.
    logreg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
    logreg.fit(X_train, y_train)

    y_pred_train = logreg.predict(X_train)
    y_pred = logreg.predict(X_test)

    print("LR train accuracy",metrics.accuracy_score(y_train, y_pred_train))
    print("LR test accuracy",metrics.accuracy_score(y_test, y_pred))

In [56]:
def Decision_Trees(X_train,X_test,y_train,y_test):
    """Fit four decision-tree variants and print train/test accuracy for each.

    The variants are: default settings, ``max_depth=10``,
    ``min_samples_leaf=2`` and ``min_samples_split=3`` (all with
    ``random_state=42``).

    Parameters
    ----------
    X_train, X_test
        Feature matrices handed to ``fit`` / ``predict``.
    y_train, y_test
        True labels for the corresponding feature matrices.

    Returns
    -------
    None
        Two lines (train and test accuracy) are printed per variant.
    """
    # Each classifier is paired with the label printed in front of its scores,
    # so the eight output lines can be told apart.
    labelled_models = [
        ("DT classifier:", DecisionTreeClassifier(random_state=42)),
        ("DT max depth = 10:", DecisionTreeClassifier(random_state=42, max_depth=10)),
        ("DT min leafs = 2:", DecisionTreeClassifier(random_state=42, min_samples_leaf=2)),
        ("DT min splits = 3:", DecisionTreeClassifier(random_state=42, min_samples_split=3)),
    ]

    for print_key, dectree in labelled_models:
        dectree.fit(X_train, y_train)

        y_pred_train = dectree.predict(X_train)
        y_pred_test = dectree.predict(X_test)

        # accuracy_score compares true labels with predictions, so the first
        # argument must be y_*, not the feature matrix X_*.
        print(print_key,",Train accuracy",metrics.accuracy_score(y_train, y_pred_train))
        print(print_key,",Test accuracy",metrics.accuracy_score(y_test, y_pred_test))

In [57]:
# Load the 20 Newsgroups training subset; headers, footers and quoted text
# are removed from each post via the `remove` argument.
twentyng_data = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
twentyng_target = twentyng_data.target

# Turn the raw posts into a TF-IDF matrix (English stop words excluded,
# document-frequency limits max_df=0.95 / min_df=2).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
)
twentyng_data_vectors = vectorizer.fit_transform(twentyng_data.data)

In [58]:

# Hold out 33% of the vectorised posts for testing; the seed is fixed so the
# split is reproducible.
tNGX_train, tNGX_test, tNGy_train, tNGy_test = train_test_split(
    twentyng_data_vectors, twentyng_target, test_size=0.33, random_state=42)

print("Twenty NG data")
# The matrix split above is the one built from the whole fitted vocabulary, so
# the header says so (it previously claimed "Top 30 features").
print("All features for Decision Tree Classifier")

# Four decision-tree variants and, in the same order, the label printed in
# front of each variant's scores.
models = [DecisionTreeClassifier(random_state=42),
          DecisionTreeClassifier(random_state=42, max_depth=10),
          DecisionTreeClassifier(random_state=42, min_samples_leaf=2),
          DecisionTreeClassifier(random_state=42, min_samples_split=3)]
print_keys = ["DT classifier:",
              "DT max depth = 10:",
              "DT min leafs = 2:",
              "DT min splits = 3:"]

for print_key, dectree in zip(print_keys, models):
    dectree.fit(tNGX_train, tNGy_train)
    print("label:",print_key)

    y_pred_train = dectree.predict(tNGX_train)
    y_pred_test = dectree.predict(tNGX_test)

    print(print_key,",Train accuracy",metrics.accuracy_score(tNGy_train, y_pred_train))
    print(print_key,",Test accuracy",metrics.accuracy_score(tNGy_test, y_pred_test))



Twenty NG data
Top 30 features for Decision Tree Classifier
label: DT classifier:
DT classifier: ,Train accuracy 0.9720316622691293
DT classifier: ,Test accuracy 0.4833958221746117
label: DT max depth = 10:
DT max depth = 10: ,Train accuracy 0.24459102902374671
DT max depth = 10: ,Test accuracy 0.22656668452062131
label: DT min leafs = 2:
DT min leafs = 2: ,Train accuracy 0.8424802110817942
DT min leafs = 2: ,Test accuracy 0.47455811462238884
label: DT min splits = 3:
DT min splits = 3: ,Train accuracy 0.9411609498680739
DT min splits = 3: ,Test accuracy 0.4724156400642742


In [59]:
# Baseline: L1-penalised logistic regression on the current train/test split.
Logistic_Regression(tNGX_train, tNGX_test, tNGy_train, tNGy_test)


LR train accuracy 0.7178100263852243
LR test accuracy 0.6349758971612212


In [60]:
# Number of highest-weighted terms taken per class, and also the number of
# terms kept overall after the per-class scores are merged.
TOP_K = 200

# NOTE: this cell reads `vectorizer` and `tNGX_train` and then rebinds them
# (plus the other tNG* splits) to the reduced-vocabulary versions, so it is
# not idempotent. Re-run the load and split cells before re-running it.

# Fit an L1-penalised model whose coefficients are used as term importances.
# The solver is named explicitly because penalty='l1' is rejected by the
# 'lbfgs' default of recent scikit-learn releases.
model = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=42)
model.fit(tNGX_train, tNGy_train)
importance = model.coef_  # one row of per-term weights per class (multiclass data)

# get_feature_names() is gone from current scikit-learn; prefer the
# replacement when it exists and fall back for old installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = [str(name) for name in vectorizer.get_feature_names_out()]
else:
    features = vectorizer.get_feature_names()

# For every class take its TOP_K largest coefficients and add them up per
# term, so a term that ranks highly for several classes scores higher.
final_features = {}
for class_weights in importance:
    # Slicing (rather than indexing 0..TOP_K-1) also copes with vocabularies
    # smaller than TOP_K.
    for idx in np.argsort(-class_weights)[:TOP_K]:
        term = features[idx]
        final_features[term] = final_features.get(term, 0) + float(class_weights[idx])

# Keep the TOP_K terms with the largest summed weight, best first.
final_features = sorted(final_features.items(), key=lambda t: t[1], reverse=True)[:TOP_K]
print(final_features)
final_k_features = [term for term, _ in final_features]
print("new:",final_k_features)
print(len(final_k_features))

# Convert train and test data to vectors based on the k selected features.
# The posts already loaded in `twentyng_data` / `twentyng_target` are reused
# instead of fetching the same subset a second time.
vectorizer = TfidfVectorizer(stop_words='english',max_df=0.95,min_df=2,vocabulary = final_k_features)
twentyng_data_vectors = vectorizer.fit_transform(twentyng_data.data)
tNGX_train, tNGX_test, tNGy_train, tNGy_test = train_test_split(
    twentyng_data_vectors, twentyng_target, test_size=0.33, random_state=42)
Logistic_Regression(tNGX_train,tNGX_test,tNGy_train,tNGy_test)


[('bike', 26.7757790754995), ('sale', 22.88435147023588), ('hockey', 22.491313796575604), ('dod', 21.869254913502587), ('encryption', 21.359571335814213), ('israeli', 20.860489760631967), ('space', 20.72374390448121), ('armenians', 18.198732938309558), ('nsa', 17.746875885188608), ('motif', 17.582710786541842), ('car', 17.575584589413808), ('circuit', 17.314821454743353), ('gun', 17.131892969154254), ('clipper', 17.017940530365347), ('arab', 16.90183053457361), ('ride', 16.878185048388954), ('firearms', 16.777615566847896), ('israel', 16.66173360694052), ('orbit', 16.603370789434738), ('launch', 15.989835410837477), ('windows', 15.845205302346585), ('privacy', 15.722521954505156), ('x11r5', 15.613898080718567), ('church', 15.484488228120325), ('bikes', 15.357458865031331), ('god', 15.249967790391867), ('graphics', 14.916384968300786), ('apple', 14.88417008277051), ('government', 14.813439087434508), ('widget', 14.579158366955088), ('offer', 14.445103212944131), ('doctor', 14.3454381772

In [61]:
print("Twenty NG data")
# Report the real width of the feature matrix being used; the header
# previously claimed "Top 30" regardless of how many terms were selected.
print("Top", tNGX_train.shape[1], "features for Decision Tree Classifier")

# Four decision-tree variants and, in the same order, the label printed in
# front of each variant's scores.
models = [DecisionTreeClassifier(random_state=42),
          DecisionTreeClassifier(random_state=42, max_depth=10),
          DecisionTreeClassifier(random_state=42, min_samples_leaf=2),
          DecisionTreeClassifier(random_state=42, min_samples_split=3)]
print_keys = ["DT classifier:",
              "DT max depth = 10:",
              "DT min leafs = 2:",
              "DT min splits = 3:"]

for print_key, dectree in zip(print_keys, models):
    dectree.fit(tNGX_train, tNGy_train)
    print("label:",print_key)

    y_pred_train = dectree.predict(tNGX_train)
    y_pred_test = dectree.predict(tNGX_test)

    print(print_key,",Train accuracy",metrics.accuracy_score(tNGy_train, y_pred_train))
    print(print_key,",Test accuracy",metrics.accuracy_score(tNGy_test, y_pred_test))

Twenty NG data
Top 30 features for Decision Tree Classifier
label: DT classifier:
DT classifier: ,Train accuracy 0.683641160949868
DT classifier: ,Test accuracy 0.48580610605249064
label: DT max depth = 10:
DT max depth = 10: ,Train accuracy 0.2399736147757256
DT max depth = 10: ,Test accuracy 0.2292447777182646
label: DT min leafs = 2:
DT min leafs = 2: ,Train accuracy 0.6187335092348285
DT min leafs = 2: ,Test accuracy 0.4595607927155865
label: DT min splits = 3:
DT min splits = 3: ,Train accuracy 0.6745382585751979
DT min splits = 3: ,Test accuracy 0.483663631494376
