In [1]:
from bs4 import BeautifulSoup
from nltk.tokenize import TweetTokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import pandas as pd

In [2]:
def parse_data(path):
    # load data
    train = open(path).read()
    train = BeautifulSoup(train, 'lxml')
    tweets = [twt.text for twt in train.findAll('content')]
    labels = [labs.text for labs in train.findAll('value')]
    ids = [labs.text for labs in train.findAll('tweetid')] 
    return tweets, labels, ids


In [28]:
def parse_data_test(path):
    # load data
    train = open(path).read()
    train = BeautifulSoup(train, 'lxml')
    tweets = [twt.text for twt in train.findAll('content')]
    ids = [labs.text for labs in train.findAll('tweetid')] 
    return tweets, ids

In [29]:
tuitTrain,polTrain,idTrain =parse_data('TASS2017_T1_training.xml')
tuitDev,polDev,idDev =parse_data('TASS2017_T1_development.xml')
tuitTest,idTest =  parse_data_test("TASS2017_T1_test.xml")

In [30]:
tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True, preserve_case=False)
tuitTrainClean = list(map(" ".join,map(tokenizer.tokenize,tuitTrain)))
tuitDevClean = list(map(" ".join,map(tokenizer.tokenize,tuitDev)))
tuitTestClean = list(map(" ".join,map(tokenizer.tokenize,tuitTest)))

In [20]:
pipe = Pipeline([
        ("tfidf",TfidfVectorizer()),
        ("clf",LogisticRegression())
])
    
parameters = {"tfidf__ngram_range" : [(1,2),(1,3),(2,3),(3,4),(3,5),(3,6),(4,5)]
                  ,"tfidf__max_df":[0.3,0.4,0.5,0.6,0.7,0.8,0.9],
                  "tfidf__min_df":[1,2,3,5],
                  "tfidf__analyzer":["char_wb"],
                  #'clf__kernel':['linear', 'rbf'],
                  'clf__C':[1,10,100,1000,10000]}
clf = GridSearchCV(pipe, parameters,cv=5,n_jobs=-1,verbose=2,scoring="f1_macro")
clf.fit(tuitTrainClean, polTrain)

Fitting 5 folds for each of 980 candidates, totalling 4900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   57.1s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 4900 out of 4900 | elapsed:  5.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [21]:
clf.best_score_

0.3978413396036023

In [22]:
clf.best_params_

{'clf__C': 10,
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.3,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (4, 5)}

In [32]:
pipe = Pipeline([
        ("tfidf",TfidfVectorizer(max_df=0.4,min_df=2,ngram_range=(4,5),analyzer="char_wb")),
        ("clf",svm.SVC(C=10000,kernel="rbf"))
])
pipe.fit(tuitTrainClean, polTrain)
prediction = pipe.predict(tuitTestClean)



In [36]:
with open("svm_marcos.txt","w") as f:
    for id, pred in zip(idTest,prediction):
        f.write("%s\t%s\n" %(id,pred))

In [27]:
from sklearn.metrics import classification_report
print(classification_report(polDev, prediction))

              precision    recall  f1-score   support

           N       0.58      0.80      0.67       219
         NEU       0.21      0.07      0.11        69
        NONE       0.48      0.21      0.29        62
           P       0.64      0.62      0.63       156

    accuracy                           0.57       506
   macro avg       0.48      0.42      0.42       506
weighted avg       0.53      0.57      0.53       506



In [12]:
from collections import Counter
Counter(polTrain)

Counter({'NONE': 139, 'N': 418, 'P': 318, 'NEU': 133})