In [1]:
import bz2
import csv

import h5py
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_covtype
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize


In [2]:
# Accumulator for per-dataset baseline rows; each training cell appends
# [dataset_name, (precision, recall, f1)] and the export cell writes it out.
baseList = list()

## COVTYPE

In [3]:
# Forest CoverType benchmark, obtained through scikit-learn's dataset loader.
# `fetch_covtype` is imported in the notebook's top import cell so that all
# dependencies are visible in one place.
cov_type = fetch_covtype()
# Rescale every sample (row) to unit L1 norm before fitting.
X = normalize(cov_type.data, norm="l1")
y = cov_type.target

In [4]:
# Baseline for covtype: multinomial logistic regression fitted on 30% of the
# rows and evaluated on the remaining 70% (test_size=0.7).
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.7, random_state=1)
c = 1  # inverse regularisation strength handed to LogisticRegression
# NOTE: `rf` is a historical name; the estimator is a logistic regression.
rf = LogisticRegression(max_iter=1200, solver='lbfgs', C=c, multi_class='multinomial')
rf.fit(train, train_labels)
rf_pred = rf.predict(test)

# Support-weighted precision / recall / F1 on the held-out split. With
# average='weighted' the fourth element (support) is None.
# zero_division=0 scores classes that are never predicted as 0 -- the same value
# the default falls back to -- without emitting the undefined-metric warning.
tr_acc = precision_recall_fscore_support(
    test_labels, rf_pred, average='weighted', zero_division=0
)

# Keep only (precision, recall, F1); the trailing None support is dropped.
baseList.append(['covtype', tr_acc[:-1]])
print(tr_acc)
# These metrics come from the held-out split, so they are reported as test scores.
print("Test score: precision {}, recall {}, F1 {}, support {}".format(*tr_acc))

(0.5529311549081724, 0.6008546651291218, 0.5605229342088506, None)
Training score: precision 0.5529311549081724, recall 0.6008546651291218, F1 0.5605229342088506, support None


  _warn_prf(average, modifier, msg_start, len(result))


## Sensorless

In [5]:
# Sensorless Drive Diagnosis: numeric text file read with np.loadtxt; the last
# column is used as the class label, all preceding columns as features.
data = np.loadtxt('../data/SensorlessDriveDiagnosis/Sensorless_drive_diagnosis.txt')
# Shuffle the rows in place with a fixed seed so the downstream split -- and
# therefore the reported scores -- are reproducible after a kernel restart.
np.random.default_rng(1).shuffle(data)
# Rescale every sample (row) to unit L1 norm before fitting.
X = normalize(data[:, :-1], norm="l1")
y = data[:, -1]

In [6]:
# Baseline for sensorless: multinomial logistic regression fitted on 30% of the
# rows and evaluated on the remaining 70% (test_size=0.7).
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.7, random_state=1)
c = 1  # inverse regularisation strength handed to LogisticRegression
# NOTE: `rf` is a historical name; the estimator is a logistic regression.
rf = LogisticRegression(max_iter=1200, solver='lbfgs', C=c, multi_class='multinomial')
rf.fit(train, train_labels)
rf_pred = rf.predict(test)

# Support-weighted precision / recall / F1 on the held-out split. With
# average='weighted' the fourth element (support) is None.
tr_acc = precision_recall_fscore_support(test_labels, rf_pred, average='weighted')

# Keep only (precision, recall, F1); the trailing None support is dropped.
baseList.append(['sensorless', tr_acc[:-1]])
print(tr_acc)
# These metrics come from the held-out split, so they are reported as test scores.
print("Test score: precision {}, recall {}, F1 {}, support {}".format(*tr_acc))


(0.16562053992993628, 0.16324437825036014, 0.15433110850462872, None)
Training score: precision 0.16562053992993628, recall 0.16324437825036014, F1 0.15433110850462872, support None


## Tuandromd

In [7]:
# TUANDROMD: read the CSV, discard every row that contains a missing value and
# continue with the raw array; the last column is used as the label.
tuandromd_frame = pd.read_csv('../data/TUANDROMD/TUANDROMD.csv')
data = tuandromd_frame.dropna(how='any', axis=0).values
y = data[:, -1]
# Rescale every sample (row) to unit L1 norm before fitting.
X = normalize(data[:, :-1], norm="l1")

In [8]:
# Baseline for tuandromd: multinomial logistic regression fitted on 30% of the
# rows and evaluated on the remaining 70% (test_size=0.7).
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.7, random_state=1)
c = 1  # inverse regularisation strength handed to LogisticRegression
# NOTE: `rf` is a historical name; the estimator is a logistic regression.
rf = LogisticRegression(max_iter=1200, solver='lbfgs', C=c, multi_class='multinomial')
rf.fit(train, train_labels)
rf_pred = rf.predict(test)

# Support-weighted precision / recall / F1 on the held-out split. With
# average='weighted' the fourth element (support) is None.
tr_acc = precision_recall_fscore_support(test_labels, rf_pred, average='weighted')

# Keep only (precision, recall, F1); the trailing None support is dropped.
baseList.append(['tuandromd', tr_acc[:-1]])
print(tr_acc)
# These metrics come from the held-out split, so they are reported as test scores.
print("Test score: precision {}, recall {}, F1 {}, support {}".format(*tr_acc))



(0.9431724763705104, 0.94176, 0.9386746703123356, None)
Training score: precision 0.9431724763705104, recall 0.94176, F1 0.9386746703123356, support None


## syn

In [9]:
# Synthetic dataset stored as HDF5. Read both arrays fully into memory inside a
# context manager so the file handle is closed as soon as loading is done.
with h5py.File('../data/syn/syn.hdf5', 'r') as f:
    features = f['features'][:]
    labels = f['labels'][:]

# Seeded permutation of the sample indices so the split -- and therefore the
# reported scores -- are reproducible after a kernel restart.
inx = np.arange(labels.shape[0])
np.random.default_rng(1).shuffle(inx)

# 'features' is transposed before row indexing so that rows line up with labels.
# The last 100 shuffled samples are dropped
# (NOTE(review): presumably held out for another experiment -- confirm).
X = features.T[inx][:-100]
y = labels[inx][:-100]

In [10]:
# Baseline for syn: multinomial logistic regression fitted on 30% of the
# rows and evaluated on the remaining 70% (test_size=0.7).
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.7, random_state=1)
c = 0.0001  # inverse regularisation strength handed to LogisticRegression
# NOTE: `rf` is a historical name; the estimator is a logistic regression.
rf = LogisticRegression(max_iter=1200, solver='lbfgs', C=c, multi_class='multinomial')
rf.fit(train, train_labels)
rf_pred = rf.predict(test)

# Support-weighted precision / recall / F1 on the held-out split. With
# average='weighted' the fourth element (support) is None.
tr_acc = precision_recall_fscore_support(test_labels, rf_pred, average='weighted')

# Keep only (precision, recall, F1); the trailing None support is dropped.
baseList.append(['syn', tr_acc[:-1]])
print(tr_acc)
# These metrics come from the held-out split, so they are reported as test scores.
print("Test score: precision {}, recall {}, F1 {}, support {}".format(*tr_acc))

(0.8603006598764303, 0.8605496707701117, 0.859230443046017, None)
Training score: precision 0.8603006598764303, recall 0.8605496707701117, F1 0.859230443046017, support None


## Blog

In [11]:
# Blog dataset: feature matrix and label vector stored as separate .npy files.
X, y = (
    np.load("../data/blog/xtrain.npy"),
    np.load("../data/blog/ytrain.npy"),
)

In [12]:
# Baseline for blog: multinomial logistic regression fitted on 30% of the
# rows and evaluated on the remaining 70% (test_size=0.7).
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.7, random_state=1)
c = 0.001  # inverse regularisation strength handed to LogisticRegression
# NOTE: `rf` is a historical name; the estimator is a logistic regression.
rf = LogisticRegression(max_iter=1200, solver='lbfgs', C=c, multi_class='multinomial')
rf.fit(train, train_labels)
rf_pred = rf.predict(test)

# Support-weighted precision / recall / F1 on the held-out split. With
# average='weighted' the fourth element (support) is None.
tr_acc = precision_recall_fscore_support(test_labels, rf_pred, average='weighted')

# Keep only (precision, recall, F1); the trailing None support is dropped.
baseList.append(['blog', tr_acc[:-1]])
print(tr_acc)
# These metrics come from the held-out split, so they are reported as test scores.
print("Test score: precision {}, recall {}, F1 {}, support {}".format(*tr_acc))

(0.7926394892970967, 0.7957085991602596, 0.7913803086325824, None)
Training score: precision 0.7926394892970967, recall 0.7957085991602596, F1 0.7913803086325824, support None


## Income

In [13]:
# Income dataset: feature matrix and label vector stored as separate .npy files.
X, y = (
    np.load("../data/income/train_feat_std.npy"),
    np.load("../data/income/train_label.npy"),
)

In [14]:
# Baseline for income: multinomial logistic regression fitted on 30% of the
# rows and evaluated on the remaining 70% (test_size=0.7).
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.7, random_state=1)
c = 0.001  # inverse regularisation strength handed to LogisticRegression
# NOTE: `rf` is a historical name; the estimator is a logistic regression.
rf = LogisticRegression(max_iter=1200, solver='lbfgs', C=c, multi_class='multinomial')
rf.fit(train, train_labels)
rf_pred = rf.predict(test)

# Support-weighted precision / recall / F1 on the held-out split. With
# average='weighted' the fourth element (support) is None.
tr_acc = precision_recall_fscore_support(test_labels, rf_pred, average='weighted')

# Keep only (precision, recall, F1); the trailing None support is dropped.
baseList.append(['income', tr_acc[:-1]])
print(tr_acc)
# These metrics come from the held-out split, so they are reported as test scores.
print("Test score: precision {}, recall {}, F1 {}, support {}".format(*tr_acc))

(0.820244391751737, 0.8280761579994317, 0.8105013322099798, None)
Training score: precision 0.820244391751737, recall 0.8280761579994317, F1 0.8105013322099798, support None


## export

In [15]:
# Write the collected baseline rows to base.csv in the working directory.
# NOTE(review): every baseList row is [name, (precision, recall, f1)], so the
# 'test_acc' column receives the str() of that tuple rather than one accuracy
# value. The layout is kept unchanged in case other code reads this file.
fields = ['dataset', 'test_acc']

# The csv module expects files opened with newline=''; without it, platforms
# that translate '\n' on write (e.g. Windows) produce a blank line after every row.
with open('base.csv', 'w', newline='') as f:
    write = csv.writer(f)

    write.writerow(fields)    # header
    write.writerows(baseList)  # one row per dataset