In [1]:
import os
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import math
import helpers.data_mining_helpers as dmh

%matplotlib inline

In [2]:
# Set `dir_data` to the folder (relative to this notebook) that holds the CSV files.
# Comment out any file you do not need; note that `public_test_split` and
# `test_submission` are both read by later cells of this notebook.
dir_data = 'data_set'

f_train_set = os.path.join(dir_data, 'train_set.csv')
# f_test_set = os.path.join(dir_data, 'test_set.csv')
f_public_test_split = os.path.join(dir_data, 'public_test_split.csv')
f_test_submission = os.path.join(dir_data, 'task1_sample_submission.csv')

# Read the files into pandas DataFrames.
train_set = pd.read_csv(f_train_set)
# test_set = pd.read_csv(f_test_set)
# Loaded here so the notebook survives Restart Kernel -> Run All: the
# test-transformation and submission cells below use these two frames.
public_test_split = pd.read_csv(f_public_test_split)
test_submission = pd.read_csv(f_test_submission)

In [3]:
# Label names, listed in the same order as the label columns of the submission file.
TASKS = [
    'BACKGROUND',
    'OBJECTIVES',
    'METHODS',
    'RESULTS',
    'CONCLUSIONS',
    'OTHERS',
]

In [4]:
# Preview the first rows of the training frame.
train_set.head()

Unnamed: 0,Sentence,Task,unigrams,processed_sen,unigrams_no_stop_words,processed_sen_no_stop,Doc_no.,Num_of_sentences,Rank,Rank%,Is_first,Is_last
0,5G millimeter wave (mmWave) technology is envi...,BACKGROUND,"['5g', 'millimet', 'wave', '(', 'mmwave', ')',...",5g millimet wave ( mmwave ) technolog is envis...,"['5g', 'millimet', 'wave', '(', 'mmwave', ')',...",5g millimet wave ( mmwave ) technolog envis in...,6137,5,0,0.0,1,0
1,The reliability of mmWave links may be comprom...,BACKGROUND,"['the', 'reliabl', 'of', 'mmwave', 'link', 'ma...",the reliabl of mmwave link may be compromis du...,"['reliabl', 'mmwave', 'link', 'may', 'compromi...",reliabl mmwave link may compromis due difficul...,6137,5,1,0.2,0,0
2,"To address such challenges, out-of-band inform...",OBJECTIVES,"['to', 'address', 'such', 'challeng', ',', 'ou...","to address such challeng , out-of-band inform ...","['address', 'challeng', ',', 'out-of-band', 'i...","address challeng , out-of-band inform sub-6 gh...",6137,5,2,0.4,0,0
3,"In this paper, we use ray tracing simulations ...",METHODS,"['in', 'thi', 'paper', ',', 'we', 'use', 'ray'...","in thi paper , we use ray trace simul to chara...","['paper', ',', 'use', 'ray', 'trace', 'simul',...","paper , use ray trace simul character angular ...",6137,5,3,0.6,0,0
4,Our results shed light on increasing sparsity ...,RESULTS/CONCLUSIONS,"['our', 'result', 'shed', 'light', 'on', 'incr...",our result shed light on increas sparsiti beha...,"['result', 'shed', 'light', 'increas', 'sparsi...",result shed light increas sparsiti behavior pr...,6137,5,4,0.8,0,1


In [5]:
# A 'Task' value may join several labels with '/' (e.g. 'RESULTS/CONCLUSIONS');
# store them as a list of labels per sentence.
train_set['label'] = train_set['Task'].map(lambda task: task.split('/'))

In [6]:
# Passed as `max_features` to the bigram vectorizers below.
FEATURES = 30000

# Text column handed to the vectorizers; swap in the commented line to use
# the 'processed_sen_no_stop' column instead.
COL_TO_CONVERT = 'processed_sen'
# COL_TO_CONVERT = 'processed_sen_no_stop'

In [7]:
# Count features over the selected text column. The helper returns a pair:
# the first item is reused later via `.transform` on the test text, the
# second is the training matrix that gets stacked into X.
train_text = train_set[COL_TO_CONVERT]
count_vect, data_count = dmh.get_count_vect(train_text)
data_count.shape

(37409, 17045)

In [8]:
# TF-IDF counterpart of the count features: same text column, same
# (vectorizer, matrix) pair returned by the helper.
train_text = train_set[COL_TO_CONVERT]
tfidf_vect, data_tfidf = dmh.get_tfidf_vect(train_text)
data_tfidf.shape

(37409, 17045)

In [9]:
# Bigram-only count features (ngram_range=(2, 2)), capped at FEATURES columns.
bi_count_vect, bi_data_count = dmh.get_count_vect(
    train_set[COL_TO_CONVERT],
    max_features=FEATURES,
    ngram_range=(2, 2),
)
bi_data_count.shape

(37409, 30000)

In [10]:
# Bigram-only TF-IDF features (ngram_range=(2, 2)), capped at FEATURES columns.
bi_tfidf_vect, bi_data_tfidf = dmh.get_tfidf_vect(
    train_set[COL_TO_CONVERT],
    max_features=FEATURES,
    ngram_range=(2, 2),
)
bi_data_tfidf.shape

(37409, 30000)

In [11]:
# from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
# Learn the label vocabulary from the per-sentence label lists, then encode
# every sentence as one 0/1 row with a column per label.
mlb = MultiLabelBinarizer()
mlb.fit(train_set['label'])
y_train = mlb.transform(train_set['label'])

# Column order of the indicator matrix.
mlb.classes_

array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVES', 'OTHERS',
       'RESULTS'], dtype=object)

In [13]:
from scipy.sparse import csr_matrix, vstack, hstack

In [14]:
# Wrap each positional metadata column of the training frame in its own
# (N, 1) sparse matrix so it can be stacked next to the n-gram matrices.
df = train_set
N = len(df)
feature_names = ['Num_of_sentences', 'Rank', 'Rank%', 'Is_first', 'Is_last']

row_idx = np.arange(N)
col_idx = np.zeros(N)  # every value goes into column 0
features = [
    csr_matrix((df[name].values, (row_idx, col_idx)), shape=(N, 1))
    for name in feature_names
]

In [15]:
# 10-fold cross-validated micro-F1 of a multinomial naive Bayes classifier.

# Unigram counts, bigram counts, then the metadata columns, side by side.
X = hstack([data_count, bi_data_count] + features)
# X = hstack([data_tfidf, bi_data_tfidf] + features)

# Raw 'Task' strings are the target, so a combination such as
# 'RESULTS/CONCLUSIONS' is treated as one class of its own.
y = train_set['Task']

clf = MultinomialNB()
nb_scores = cross_val_score(clf, X, y, cv=10, scoring='f1_micro')
nb_scores.mean()



0.5484590044171107

In [16]:
# 10-fold cross-validated micro-F1 of a decision tree classifier.

# Unigram counts, bigram counts, then the metadata columns, side by side.
X = hstack([data_count, bi_data_count] + features)
# X = hstack([data_tfidf, bi_data_tfidf] + features)

# Multilabel target: the 0/1 indicator matrix produced by `mlb`.
y = y_train

clf = DecisionTreeClassifier(random_state=0)
tree_scores = cross_val_score(clf, X, y, cv=10, scoring='f1_micro')
tree_scores.mean()

0.5490778464456165

---

---

### Model Building

In [66]:
# Fit the naive Bayes classifier on the full training set.

# Unigram counts, bigram counts, then the metadata columns, side by side.
X = hstack([data_count, bi_data_count] + features)
# X = hstack([data_tfidf, bi_data_tfidf] + features)

# Raw 'Task' strings: predictions come back as strings such as
# 'RESULTS/CONCLUSIONS'.
y = train_set['Task']

# NOTE: binds `clf`, the name the prediction cell reads.
clf = MultinomialNB()
clf.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [51]:
# Fit the decision tree classifier on the full training set.

# Unigram counts, bigram counts, then the metadata columns, side by side.
X = hstack([data_count, bi_data_count] + features)
# X = hstack([data_tfidf, bi_data_tfidf] + features)

# Multilabel target: the 0/1 indicator matrix produced by `mlb`.
y = y_train

# NOTE: binds `clf`, the name the prediction cell reads.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

### Transforming the test data

In [54]:
# Preview the first rows of the public test split.
public_test_split.head()

Unnamed: 0,Sentence,unigrams,processed_sen,unigrams_no_stop_words,processed_sen_no_stop,Doc_no.,Num_of_sentences,Rank,Rank%,Is_first,Is_last
0,Mobile Crowdsensing is a promising paradigm fo...,"['mobil', 'crowdsens', 'is', 'a', 'promis', 'p...",mobil crowdsens is a promis paradigm for ubiqu...,"['mobil', 'crowdsens', 'promis', 'paradigm', '...","mobil crowdsens promis paradigm ubiquit sens ,...",0,7,0,0.0,1,0
1,As a fundamental property of Mobile Crowdsensi...,"['as', 'a', 'fundament', 'properti', 'of', 'mo...",as a fundament properti of mobil crowdsens sys...,"['fundament', 'properti', 'mobil', 'crowdsens'...","fundament properti mobil crowdsens system , te...",0,7,1,0.142857,0,0
2,"Therefore, a mechanism is required for the sys...","['therefor', ',', 'a', 'mechan', 'is', 'requir...","therefor , a mechan is requir for the system s...","['therefor', ',', 'mechan', 'requir', 'system'...","therefor , mechan requir system server recruit...",0,7,2,0.285714,0,0
3,"In this paper, we develop a novel Cheating-Res...","['in', 'thi', 'paper', ',', 'we', 'develop', '...","in thi paper , we develop a novel cheating-res...","['paper', ',', 'develop', 'novel', 'cheating-r...","paper , develop novel cheating-resili incent (...",0,7,3,0.428571,0,0
4,"Via theoretical analysis, we demonstrate the c...","['via', 'theoret', 'analysi', ',', 'we', 'demo...","via theoret analysi , we demonstr the correct ...","['via', 'theoret', 'analysi', ',', 'demonstr',...","via theoret analysi , demonstr correct design .",0,7,4,0.571429,0,0


In [56]:
# Text column to vectorize for the test sentences. The vectorizers were
# fitted on train_set[COL_TO_CONVERT], so keep this on the same column.
COL_TO_CONVERT = 'processed_sen'
# COL_TO_CONVERT = 'processed_sen_no_stop'

# From here on `df` refers to the public test split.
df = public_test_split

In [57]:
# Encode the test text with the count vectorizer fitted on the training text.
test_text = df[COL_TO_CONVERT]
test_count = count_vect.transform(test_text)
test_count.shape

(131166, 19127)

In [58]:
# Encode the test text with the TF-IDF vectorizer fitted on the training text.
test_text = df[COL_TO_CONVERT]
test_tfidf = tfidf_vect.transform(test_text)
test_tfidf.shape

(131166, 19127)

In [59]:
# Encode the test text with the bigram count vectorizer fitted on the training text.
test_text = df[COL_TO_CONVERT]
bi_test_count = bi_count_vect.transform(test_text)
bi_test_count.shape

(131166, 30000)

In [60]:
# Encode the test text with the bigram TF-IDF vectorizer fitted on the training text.
test_text = df[COL_TO_CONVERT]
bi_test_tfidf = bi_tfidf_vect.transform(test_text)
bi_test_tfidf.shape

(131166, 30000)

In [61]:
# Wrap each positional metadata column of the public test split in its own
# (N, 1) sparse matrix, mirroring what was done for the training frame.
#
# NOTE(review): this rebinds `df`, `N` and `features`, which the
# cross-validation and model-building cells also read. Re-running one of
# those cells after this one stacks test-sized columns onto training matrices.
df = public_test_split
N = len(df)
feature_names = ['Num_of_sentences', 'Rank', 'Rank%', 'Is_first', 'Is_last']

row_idx = np.arange(N)
col_idx = np.zeros(N)  # every value goes into column 0
features = [
    csr_matrix((df[name].values, (row_idx, col_idx)), shape=(N, 1))
    for name in feature_names
]

len(features)

5

In [62]:
# Assemble the test matrix with the same column layout as the training X:
# unigram counts, bigram counts, then the metadata columns.
X_test = hstack([test_count, bi_test_count] + features)
# X_test = hstack([test_tfidf, bi_test_tfidf] + features)

In [67]:
# Predict with `clf`, i.e. whichever model-building cell was executed last
# (both the naive Bayes and the decision tree cell bind that name).
y_pred = clf.predict(X_test)
# Number of predictions.
y_pred.shape[0]

131166

In [73]:
# Peek at the first five predictions.
y_pred[:5]

array(['BACKGROUND', 'BACKGROUND', 'BACKGROUND', 'OBJECTIVES', 'RESULTS'],
      dtype='<U49')

In [69]:
# Class order of the fitted binarizer; the next cell's column indices rely on it.
mlb.classes_

array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVES', 'OTHERS',
       'RESULTS'], dtype=object)

In [22]:
# Copy multilabel predictions into the submission frame.
#
# Applies only when `y_pred` is a 2-D 0/1 indicator matrix, i.e. the output of
# a classifier fitted on `y_train` (the decision tree). String predictions from
# the naive Bayes model are left to the following cell; indexing them
# positionally would write single characters into the submission.
pred_matrix = np.asarray(y_pred)
if pred_matrix.ndim == 2:
    # Labels of the first len(y_pred) rows. With the default RangeIndex these
    # are 0..N-1, the rows a per-row `.at[i, ...]` loop would address.
    rows = test_submission.index[:len(pred_matrix)]
    # Column j of the indicator matrix belongs to mlb.classes_[j], so the
    # mapping comes from the binarizer rather than hard-coded positions.
    for col_idx, task in enumerate(mlb.classes_):
        test_submission.loc[rows, task] = pred_matrix[:, col_idx]

In [74]:
# Copy string predictions (e.g. 'RESULTS/CONCLUSIONS') into the submission frame.
#
# Applies only when `y_pred` is a 1-D array of label strings, i.e. the output
# of a classifier fitted on train_set['Task'] (the naive Bayes model). A 2-D
# indicator matrix has no `.split` and is handled by the previous cell.
pred_labels = np.asarray(y_pred)
if pred_labels.ndim == 1:
    # Labels of the first len(y_pred) rows. With the default RangeIndex these
    # are 0..N-1, the rows a per-row `.at[i, ...]` loop would address.
    rows = test_submission.index[:len(pred_labels)]
    predicted_sets = [set(p.split('/')) for p in pred_labels]
    # Write an explicit 0/1 for every task column so that re-running the cell
    # after changing the model cannot leave stale 1s behind.
    for task in TASKS:
        test_submission.loc[rows, task] = [int(task in labels) for labels in predicted_sets]

In [75]:
# Write the filled-in submission frame to CSV (index column omitted), then preview it.
test_submission.to_csv('test_submission.csv', index=False)
test_submission.head()

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,1,0,0,0,0,0
1,T00001_S002,1,0,0,0,0,0
2,T00001_S003,1,0,0,0,0,0
3,T00001_S004,0,1,0,0,0,0
4,T00001_S005,0,0,0,1,0,0


In [78]:
from collections import defaultdict

# Histogram: how many predictions carry 1, 2, ... labels
# (labels in a prediction string are separated by '/').
d = defaultdict(int)
for prediction in y_pred:
    d[len(prediction.split('/'))] += 1

d

defaultdict(int, {1: 130879, 2: 287})

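### Issue with the Naive Bayes classifier
It is hard to use for multi-label data: each combined label (e.g. `RESULTS/CONCLUSIONS`) is treated as a single class, and only 287 of the 131,166 predictions above carry more than one label.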