# Import Libraries

In [None]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier

# Classification using SVM- Fold 1

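* This code classifies keyphrases using TF-IDF features and a linear model trained with stochastic gradient descent (`SGDClassifier`). Note that with the logistic loss configured below the model is a logistic regression; a linear SVM would require `loss='hinge'`.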
    
* The training and testing data for each fold are kept in separate files.

* The input files can be changed by editing the lines ```traindata=pd.read_csv("../data/test1.tsv",delimiter="\t")``` (test set) and ```traindata=pd.read_csv("../data/train1.tsv",delimiter="\t")``` (training set).

* To use plain bag-of-words features instead of TF-IDF, replace the pipeline with ```clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('clf', SGDClassifier(loss='log', penalty='l2',alpha=0.001, max_iter=100, random_state=42,class_weight="balanced",warm_start=True))])```


In [None]:

# Fold 1: load the tab-separated test and training splits.
# Each file must provide a `text` column (the sentences) and a `label` column
# (class ids, cast to int32 below).
traindata = pd.read_csv("../data/test1.tsv", delimiter="\t")
X_test = traindata["text"].values
Y_test = np.asarray(traindata["label"].values).astype(np.int32)

traindata = pd.read_csv("../data/train1.tsv", delimiter="\t")
X_train = traindata["text"].values
Y_train = np.asarray(traindata["label"].values).astype(np.int32)

# scikit-learn 1.1 renamed SGDClassifier's logistic loss from 'log' to
# 'log_loss', and 1.3 removed the old spelling. Pick the name the installed
# version accepts so this cell runs on both old and new releases.
import re
import sklearn

_sk_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
_sk_version = (int(_sk_match.group(1)), int(_sk_match.group(2))) if _sk_match else (0, 0)
_logistic_loss = "log_loss" if _sk_version >= (1, 1) else "log"

# Unigram + bigram counts -> TF-IDF weighting -> linear classifier trained with
# SGD. NOTE: with the logistic loss this is an L2-regularised logistic
# regression, not a hinge-loss SVM; switch to loss='hinge' for a linear SVM.
# Hyperparameters are fixed here; no parameter search is performed.
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss=_logistic_loss, penalty='l2', alpha=0.001,
                          max_iter=100, random_state=42,
                          class_weight="balanced", warm_start=True)),
])

# Train on the fold-1 training split and evaluate on the fold-1 test split.
clf.fit(X_train, Y_train)
ypred = clf.predict(X_test)

# Per-class precision / recall / F1, printed with 5 decimal places.
print(classification_report(Y_test, ypred, digits=5))



# Classification Fold 2

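* This code classifies keyphrases using TF-IDF features and a linear model trained with stochastic gradient descent (`SGDClassifier`). Note that with the logistic loss configured below the model is a logistic regression; a linear SVM would require `loss='hinge'`.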
    
* The training and testing data for each fold are kept in separate files.

* The input files can be changed by editing the lines ```traindata=pd.read_csv("../data/train2.tsv",delimiter="\t")``` (training set) and ```traindata=pd.read_csv("../data/test2.tsv",delimiter="\t")``` (test set).

* To use plain bag-of-words features instead of TF-IDF, replace the pipeline with ```clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('clf', SGDClassifier(loss='log', penalty='l2',alpha=0.001, max_iter=100, random_state=42,class_weight="balanced",warm_start=True))])```


In [None]:
# Fold 2: load the tab-separated training and test splits.
# Each file must provide a `text` column (the sentences) and a `label` column
# (class ids, cast to int32 below).
traindata = pd.read_csv("../data/train2.tsv", delimiter="\t")
X_train = traindata["text"].values
Y_train = np.asarray(traindata["label"].values).astype(np.int32)

traindata = pd.read_csv("../data/test2.tsv", delimiter="\t")
X_test = traindata["text"].values
Y_test = np.asarray(traindata["label"].values).astype(np.int32)

# scikit-learn 1.1 renamed SGDClassifier's logistic loss from 'log' to
# 'log_loss', and 1.3 removed the old spelling. Pick the name the installed
# version accepts so this cell runs on both old and new releases.
import re
import sklearn

_sk_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
_sk_version = (int(_sk_match.group(1)), int(_sk_match.group(2))) if _sk_match else (0, 0)
_logistic_loss = "log_loss" if _sk_version >= (1, 1) else "log"

# Unigram + bigram counts -> TF-IDF weighting -> linear classifier trained with
# SGD. NOTE: with the logistic loss this is an L2-regularised logistic
# regression, not a hinge-loss SVM; switch to loss='hinge' for a linear SVM.
# Hyperparameters are fixed here; no parameter search is performed.
clf2 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss=_logistic_loss, penalty='l2', alpha=0.001,
                          max_iter=100, random_state=42,
                          class_weight="balanced", warm_start=True)),
])

# Train on the fold-2 training split and evaluate on the fold-2 test split.
clf2.fit(X_train, Y_train)
ypred = clf2.predict(X_test)

# Per-class precision / recall / F1, printed with 5 decimal places.
print(classification_report(Y_test, ypred, digits=5))



