In [20]:
import numpy
import pandas
from sklearn.model_selection  import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import sklearn.linear_model as lm
from nltk.corpus import stopwords
import nltk
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline,metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score,precision_score
from numpy import genfromtxt
import random

# Read the raw dataset and shuffle its lines in place.
# NOTE: the shuffle is unseeded, so the row order (and therefore every
# downstream split) differs from run to run.
with open('sarcasm-dataset.txt', 'r') as fname:
    file_content = fname.readlines()
random.shuffle(file_content)

# Re-emit the records as a tab-separated file with a "tweets<TAB>label" header.
# The slicing below assumes every record ends in a one-character separator
# followed by a one-character label.
# NOTE(review): tweets are written unescaped; a tab or a leading double quote
# inside a tweet may confuse a TSV reader -- confirm against the data.
rows = ['tweets' + "\t" + 'label' + "\n"]

for tweet_content in file_content:
    # Drop the line terminator first so that a final line without a trailing
    # newline is sliced exactly like every other line.
    record = tweet_content.rstrip("\n")
    if not record:
        # Blank lines carry no tweet/label pair; skip them instead of
        # failing on the label lookup.
        continue
    tweet, label = record[:-2], record[-1]
    rows.append(tweet + "\t" + label + "\n")

# Join once rather than growing a string inside the loop.
output = "".join(rows)

# The context manager closes the file even if the write fails.
with open('dataset_csv.csv', "w") as outputfile:
    outputfile.write(output)

# Load the tab-separated dataset; its first line is the column header.
df = pandas.read_csv("dataset_csv.csv", header=0, sep='\t')

# The first 1500 rows are used for model building and everything after them
# is held out. The second slice starts at 1500 so that no row falls between
# the two partitions.
# NOTE: both names are rebound to TF-IDF feature frames further down.
train = df[0:1500]
test = df[1500:]

# Word-level TF-IDF over 1- to 3-grams. Terms must occur in at least two
# documents, term frequencies are sub-linearly scaled, and rows are
# L2-normalised.
tfidf_options = {
    "analyzer": "word",
    "max_features": None,
    "token_pattern": r'\w{1,}',
    "strip_accents": 'unicode',
    "lowercase": True,
    "ngram_range": (1, 3),
    "min_df": 2,
    "use_idf": True,
    "smooth_idf": True,
    "norm": "l2",
    "sublinear_tf": True,
}
tfidf_vec = TfidfVectorizer(**tfidf_options)

# Stratified 60/40 split of the training rows with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    train['tweets'],
    train['label'],
    test_size=0.4,
    random_state=2,
    stratify=train['label'],
)

# Learn the vocabulary and IDF weights on the fitting portion only, then
# project the validation portion onto that same vocabulary.
train_tfidf_matrix = tfidf_vec.fit_transform(x_train)
test_tfidf_matrix = tfidf_vec.transform(x_test)

# From here on `train` and `test` hold dense TF-IDF feature frames rather than
# the raw tweet rows.
train = pandas.DataFrame(train_tfidf_matrix.toarray())
test = pandas.DataFrame(test_tfidf_matrix.toarray())

# Pipeline: truncated SVD -> standardisation -> logistic regression.
# NOTE: random_state=None leaves the randomized SVD unseeded, so scores vary
# slightly between runs.
svd = TruncatedSVD(algorithm="randomized", random_state=None, tol=0.0)
scl = StandardScaler()
# The solver is named explicitly because the grid below searches both the
# "l1" and "l2" penalties: liblinear supports both, whereas the lbfgs solver
# (the library default since scikit-learn 0.22) rejects "l1".
lr = lm.LogisticRegression(class_weight="balanced", tol=0.0001,
                           solver="liblinear")
clf = pipeline.Pipeline([
    ('svd', svd),
    ('scl', scl),
    ('lr', lr),
])

# 5 * 3 * 8 * 2 = 240 candidate settings.
param_grid = {
    'svd__n_components': [200, 250, 300, 350, 400],
    'svd__n_iter': [3, 4, 5],
    'lr__C': [10, 11, 12, 13, 14, 15, 16, 17],
    'lr__penalty': ["l1", "l2"],
}

# F1 with f1_score's default averaging; higher is better.
f_scorer = metrics.make_scorer(f1_score, greater_is_better=True)

# 10-fold cross-validated search on all available cores, refitting the best
# candidate on the full data afterwards. The former `iid` argument is not
# passed: it was removed from GridSearchCV in scikit-learn 0.24 and supplying
# it raises a TypeError there.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=f_scorer,
                     verbose=10, n_jobs=-1, refit=True, cv=10)

# Report the feature-matrix dimensions (rows, then columns) for both portions.
print(train.shape[0])
print(train.shape[1])

print(test.shape[0])
print(test.shape[1])


# Run the cross-validated grid search over the fitting portion.
model.fit(train, y_train)
print(f"Best score: {model.best_score_:.3f}")

print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print(f"\t{param_name}: {best_parameters[param_name]!r}")

# The search was configured with refit=True, so best_estimator_ has already
# been refit on the whole of `train`. Fitting it a second time would only
# repeat that work and, because the SVD step is unseeded, replace the selected
# model with a different random draw.
best_model = model.best_estimator_
preds = list(best_model.predict(test))
target_labels = list(y_test)
# NOTE: this is a weighted-average F1, while the search scorer uses f1_score's
# default averaging, so this figure is not directly comparable with
# "Best score" above.
print(f1_score(target_labels, preds, average="weighted"))


900
3188
600
3188
Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   47.6s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   56.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  2

Best score: 0.901
Best parameters set:
	lr__C: 17
	lr__penalty: 'l1'
	svd__n_components: 350
	svd__n_iter: 3




0.9215783628514407
