# ANALYSIS OF SPANISH AIRLINES
## Load packages

In [None]:
from bs4 import BeautifulSoup
import collections
from collections import defaultdict

import datetime

from itertools import chain

import matplotlib
import matplotlib.pyplot as plt

import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk import ngrams
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer, word_tokenize
import random
import numpy as np

import pandas as pd

from sklearn.metrics import accuracy_score

import re

import scipy.sparse as sp

try:
    import seaborn as sns
except:
    print('Seaborn must be installed to continue (pip install seaborn).\nIt provides an enhanced plotting experience', 
          file=sys.stderr)
    if input('Do you want me to do it for you? (y/n) ') == 'y':
        !pip install seaborn
    import seaborn as sns

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

import string

import sys

try:
    from wordcloud import WordCloud, STOPWORDS
except:
    print('For the next cells you need WordCloud (pip install wordcloud)', file=sys.stderr)
    if input('Do you want me to do it for you? (y/n) ') == 'y':
        !pip install wordcloud
        
    from wordcloud import WordCloud, STOPWORDS

## Function definition

In [None]:
def precleaning(tweet):
    """Turn a raw tweet into a list of lower-cased, stop-word-free lemmas.

    Pipeline: strip markup with BeautifulSoup, remove @mentions and URLs,
    replace every non ASCII-letter character with a space, tokenize, drop the
    first two tokens, lower-case, remove Spanish stop words and lemmatize
    with the WordNet lemmatizer.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The remaining lemmas, in their original order.
    """
    stop_words = set(stopwords.words('spanish'))
    wordnet_lemmatizer = WordNetLemmatizer()
    mention_pat = r'@[A-Za-z0-9_]+'
    url_pat = r'https?://[^ ]+'
    combined_pat = r'|'.join((mention_pat, url_pat))
    # The dot is escaped so that only a literal "www." prefix is removed
    # (an unescaped "." would also match e.g. "wwwx...").
    www_pat = r'www\.[^ ]+'

    souped = BeautifulSoup(tweet, 'lxml').get_text()
    try:
        # Only byte strings have .decode(); text strings fall through unchanged.
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except (AttributeError, UnicodeError):
        bom_removed = souped

    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    only_letters = re.sub("[^a-zA-Z]", " ", stripped)

    # NOTE(review): the first two tokens are always discarded; the reason is
    # not visible in this file -- confirm this is intended.
    tokens = nltk.word_tokenize(only_letters)[2:]

    kept = [token.lower() for token in tokens]
    kept = [token for token in kept if token not in stop_words]
    return [wordnet_lemmatizer.lemmatize(token) for token in kept]

def ngrams(input_list):
    """Return all bigrams followed by all trigrams of *input_list*.

    Each n-gram is the space-joined run of consecutive items. Inputs shorter
    than the window size simply contribute no n-grams of that size.

    NOTE: this definition rebinds the name brought in by
    ``from nltk import ngrams`` at the top of the file.
    """
    def _joined_windows(size):
        # Zip the sequence against its own shifted copies to get sliding windows.
        shifted = [input_list[offset:] for offset in range(size)]
        return [' '.join(window) for window in zip(*shifted)]

    return _joined_windows(2) + _joined_windows(3)

def count_words(input):
    """Count how often each item occurs across all rows of *input*.

    *input* is an iterable of iterables (e.g. a column whose cells are lists
    of n-grams). Returns a ``collections.Counter`` keyed by item.
    """
    # Flatten the rows lazily and let Counter do the tallying.
    return collections.Counter(chain.from_iterable(input))

def cleaning(tweet):
    """Normalize a raw tweet into a single lower-case, letters-only string.

    Pipeline: strip markup with BeautifulSoup, remove @mentions and URLs,
    replace every non ASCII-letter character with a space, lower-case, and
    collapse the resulting runs of whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text with tokens separated by single spaces.
    """
    tok = WordPunctTokenizer()
    # Underscore included (as in precleaning) so that a handle such as
    # "@some_user" is removed whole instead of leaving "_user" behind.
    mention_pat = r'@[A-Za-z0-9_]+'
    http_pat = r'https?://[A-Za-z0-9./]+'
    any_scheme_pat = r'(\w+:\/\/\S+)'
    combined_pat = r'|'.join((mention_pat, http_pat, any_scheme_pat))

    souped = BeautifulSoup(tweet, 'lxml').get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        # Only byte strings have .decode(); text strings fall through unchanged.
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except (AttributeError, UnicodeError):
        clean = stripped

    lower_case = re.sub("[^a-zA-Z]", " ", clean).lower()
    # The substitution above leaves runs of spaces; tokenizing and re-joining
    # collapses them to single separators.
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

def tokenize(text):
    """Tokenize *text* with NLTK's Spanish tokenizer and Porter-stem each token.

    Used as the ``tokenizer`` callable of the TF-IDF vectorizers in this file.

    NOTE(review): the tokenizer is configured for Spanish but the stemmer is
    NLTK's Porter stemmer -- confirm this combination is intended.

    Returns
    -------
    list of str
        One stem per token, in order.
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(token)
            for token in nltk.word_tokenize(text, language='spanish')]

def obtain_data_representation(df, test=None):
    """Build TF-IDF train/test matrices from cleaned tweet text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'text' and 'airline_sentiment'. A 'text_clean' column is
        added to (or overwritten on) this frame as a side effect.
    test : pandas.DataFrame, optional
        Separate test frame with a 'text' column. When omitted, *df* is split
        75/25 into train and test. When given, all of *df* is used for
        training and *test* also receives a 'text_clean' column.

    Returns
    -------
    dict
        ``{'train': {'x', 'y'}, 'test': {'x', 'y'}}`` where 'x' is the TF-IDF
        matrix and 'y' the label array. ``test['y']`` is None when the test
        frame has no 'airline_sentiment' column (e.g. a submission file).
    """
    df['text_clean'] = np.array([cleaning(tweet) for tweet in df['text']])

    if test is None:
        # No test data supplied: carve it out of the input.
        train, test = train_test_split(df, test_size=0.25)
        # Kept for callers that rely on it: the label column of *df* becomes
        # categorical after the split has been taken.
        df.airline_sentiment = pd.Categorical(df.airline_sentiment)
    else:
        # Otherwise everything in *df* is training data.
        train = df
        test['text_clean'] = np.array([cleaning(tweet) for tweet in test['text']])

    # Fit the vocabulary / IDF weights on the training text only.
    cv = TfidfVectorizer(tokenizer=tokenize, max_df=0.8, use_idf=True, min_df=1)
    x_train = cv.fit_transform(train['text_clean'])
    y_train = train['airline_sentiment'].values

    # Project the test text onto the vocabulary fitted above.
    x_test = cv.transform(test['text_clean'])
    try:
        y_test = test['airline_sentiment'].values
    except KeyError:
        # Submission-style frames carry no target column.
        y_test = None

    return {
        'train': {
            'x': x_train,
            'y': y_train
        },
        'test': {
            'x': x_test,
            'y': y_test
        }
    }


def obtain_raw_data_representation(df, test=None):
    """Build TF-IDF train/test matrices from the tweet text as-is (no cleaning).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'text' and 'airline_sentiment'. Its 'text_clean' column
        is set to a copy of 'text' as a side effect.
    test : pandas.DataFrame, optional
        Separate test frame with a 'text' column. When omitted, *df* is split
        75/25 into train and test. When given, all of *df* is used for
        training and *test* also receives a 'text_clean' column.

    Returns
    -------
    dict
        ``{'train': {'x', 'y'}, 'test': {'x', 'y'}}`` where 'x' is the TF-IDF
        matrix and 'y' the label array. ``test['y']`` is None when the test
        frame has no 'airline_sentiment' column (e.g. a submission file).
    """
    df['text_clean'] = df['text']

    if test is None:
        # No test data supplied: carve it out of the input.
        train, test = train_test_split(df, test_size=0.25)
        # Kept for callers that rely on it: the label column of *df* becomes
        # categorical after the split has been taken.
        df.airline_sentiment = pd.Categorical(df.airline_sentiment)
    else:
        # Otherwise everything in *df* is training data.
        train = df
        test['text_clean'] = np.array(test['text'])

    # Fit the vocabulary / IDF weights on the training text only.
    cv = TfidfVectorizer(tokenizer=tokenize, max_df=0.8, use_idf=True, min_df=1)
    x_train = cv.fit_transform(train['text_clean'])
    y_train = train['airline_sentiment'].values

    # Project the test text onto the vocabulary fitted above.
    x_test = cv.transform(test['text_clean'])
    try:
        y_test = test['airline_sentiment'].values
    except KeyError:
        # Submission-style frames carry no target column.
        y_test = None

    return {
        'train': {
            'x': x_train,
            'y': y_train
        },
        'test': {
            'x': x_test,
            'y': y_test
        }
    }


### Model training

def train_model(dataset, dmodel, *model_args, **model_kwargs):
    """Instantiate *dmodel*, fit it on the train split and predict the test split.

    Parameters
    ----------
    dataset : dict
        ``{'train': {'x', 'y'}, 'test': {'x', 'y'}}``; ``test['y']`` may be None.
    dmodel : callable
        Estimator class/factory; called with ``*model_args, **model_kwargs``.

    Returns
    -------
    tuple
        ``(fitted_model, predictions)``. The accuracy is printed only when
        test labels are available.
    """
    estimator = dmodel(*model_args, **model_kwargs)
    estimator.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = estimator.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return estimator, predictions

    print("Model score is: {}".format(accuracy_score(expected, predictions)))
    return estimator, predictions

def train_modelMNB(dataset):
    """Fit ``MultinomialNB(alpha=0.5)`` on the train split and predict the test split.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    classifier = MultinomialNB(alpha=0.5)
    classifier.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = classifier.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return classifier, predictions

    print("Model MNB score is: {}".format(accuracy_score(expected, predictions)))
    return classifier, predictions

def train_modelRC(dataset):
    """Fit ``RidgeClassifier(tol=1e-2, solver="sag")`` and predict the test split.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    classifier = RidgeClassifier(tol=1e-2, solver="sag")
    classifier.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = classifier.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return classifier, predictions

    print("Model RC score is: {}".format(accuracy_score(expected, predictions)))
    return classifier, predictions

def train_modelP(dataset):
    """Fit a Perceptron for 50 epochs on the train split and predict the test split.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    # `n_iter` was deprecated and later removed from scikit-learn; `max_iter`
    # with `tol=None` (no early stopping) keeps the fixed 50-epoch behavior.
    model = Perceptron(max_iter=50, tol=None)
    model.fit(dataset['train']['x'], dataset['train']['y'])
    y_pred = model.predict(dataset['test']['x'])
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        print("Model P score is: {}".format(score))

    return model, y_pred

def train_modelPAC(dataset):
    """Fit a Passive Aggressive classifier for 50 epochs and predict the test split.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    # `n_iter` was deprecated and later removed from scikit-learn; `max_iter`
    # with `tol=None` (no early stopping) keeps the fixed 50-epoch behavior.
    model = PassiveAggressiveClassifier(max_iter=50, tol=None)
    model.fit(dataset['train']['x'], dataset['train']['y'])
    y_pred = model.predict(dataset['test']['x'])
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        print("Model PAC score is: {}".format(score))

    return model, y_pred

def train_modelRFC(dataset):
    """Fit a 30-tree ``RandomForestClassifier`` and predict the test split.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    forest = RandomForestClassifier(n_estimators=30)
    forest.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = forest.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return forest, predictions

    print("Model RFC score is: {}".format(accuracy_score(expected, predictions)))
    return forest, predictions


def train_modelRFC2(dataset):
    """Fit a seeded 30-tree Random Forest and predict the test split.

    Same forest size as ``train_modelRFC`` but with ``random_state=0`` (and
    the depth/split parameters spelled out), so runs are repeatable.
    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    model = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=2, random_state=0)
    model.fit(dataset['train']['x'], dataset['train']['y'])
    y_pred = model.predict(dataset['test']['x'])
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        # Label fixed: this used to print "RFC", indistinguishable from train_modelRFC.
        print("Model RFC2 score is: {}".format(score))

    return model, y_pred

def train_modelGBC(dataset):
    """Fit a 100-stage Gradient Boosting classifier and predict the test split.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    # The file-level sklearn.ensemble import does not include this class, so
    # import it locally to avoid a NameError.
    from sklearn.ensemble import GradientBoostingClassifier

    model = GradientBoostingClassifier(n_estimators=100)
    model.fit(dataset['train']['x'], dataset['train']['y'])
    y_pred = model.predict(dataset['test']['x'])
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        # Label fixed: this used to print "GBC2", the label of train_modelGBC2.
        print("Model GBC score is: {}".format(score))

    return model, y_pred

def train_modelGBC2(dataset):
    """Fit a seeded Gradient Boosting classifier of depth-1 trees and predict.

    Uses 100 stages, ``learning_rate=1.0``, ``max_depth=1`` and
    ``random_state=0``. Prints the test accuracy when test labels are present
    and returns ``(fitted_model, predictions)``.
    """
    # The file-level sklearn.ensemble import does not include this class, so
    # import it locally to avoid a NameError.
    from sklearn.ensemble import GradientBoostingClassifier

    model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                       max_depth=1, random_state=0)
    model.fit(dataset['train']['x'], dataset['train']['y'])
    y_pred = model.predict(dataset['test']['x'])
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        print("Model GBC2 score is: {}".format(score))

    return model, y_pred

def train_modelETC(dataset):
    """Fit a 30-tree ``ExtraTreesClassifier`` and predict the test split.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    ensemble = ExtraTreesClassifier(n_estimators=30)
    ensemble.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = ensemble.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return ensemble, predictions

    print("Model ETC score is: {}".format(accuracy_score(expected, predictions)))
    return ensemble, predictions

def train_modelETC2(dataset):
    """Fit a seeded 30-tree ``ExtraTreesClassifier`` and predict the test split.

    Uses ``max_depth=None``, ``min_samples_split=2`` and ``random_state=0``.
    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    ensemble = ExtraTreesClassifier(
        n_estimators=30,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    )
    ensemble.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = ensemble.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return ensemble, predictions

    print("Model ETC2 score is: {}".format(accuracy_score(expected, predictions)))
    return ensemble, predictions

def train_modelDTC(dataset):
    """Fit a default ``DecisionTreeClassifier`` and predict the test split.

    NOTE: a second ``train_modelDTC`` is defined further down in this file;
    being defined later, that one replaces this definition at import time.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    tree = DecisionTreeClassifier()
    tree.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = tree.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return tree, predictions

    print("Model DTC score is: {}".format(accuracy_score(expected, predictions)))
    return tree, predictions

def train_modelDTC2(dataset):
    """Fit a seeded ``DecisionTreeClassifier`` and predict the test split.

    Uses ``max_depth=None``, ``min_samples_split=2`` and ``random_state=0``.
    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    tree = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
    tree.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = tree.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return tree, predictions

    print("Model DTC2 score is: {}".format(accuracy_score(expected, predictions)))
    return tree, predictions

def train_modelSGD(dataset):
    """Fit a hinge-loss ``SGDClassifier`` and predict the test split.

    Uses ``alpha=0.0001`` and the 'optimal' learning-rate schedule.
    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    classifier = SGDClassifier(loss='hinge', alpha=0.0001, learning_rate='optimal')
    classifier.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = classifier.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return classifier, predictions

    print("Model SGD score is: {}".format(accuracy_score(expected, predictions)))
    return classifier, predictions

def train_modelSVC(dataset):
    """Fit an RBF-kernel ``SVC`` with balanced class weights and predict.

    Uses ``C=1.0``. Prints the test accuracy when test labels are present and
    returns ``(fitted_model, predictions)``.
    """
    classifier = SVC(C=1.0, kernel='rbf', class_weight= 'balanced')
    classifier.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = classifier.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return classifier, predictions

    print("Model SVC score is: {}".format(accuracy_score(expected, predictions)))
    return classifier, predictions

def train_modelKNC(dataset):
    """Fit a 5-nearest-neighbours classifier and predict the test split.

    Uses uniform weights, ``algorithm='auto'`` and ``leaf_size=30``.
    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    classifier = KNeighborsClassifier(
        n_neighbors=5,
        weights='uniform',
        algorithm='auto',
        leaf_size=30,
    )
    classifier.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = classifier.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return classifier, predictions

    print("Model KNC score is: {}".format(accuracy_score(expected, predictions)))
    return classifier, predictions

def train_modelNCC(dataset):
    """Fit a Euclidean ``NearestCentroid`` classifier and predict the test split.

    No centroid shrinking is applied (``shrink_threshold=None``).
    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    classifier = NearestCentroid(metric='euclidean', shrink_threshold=None)
    classifier.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = classifier.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return classifier, predictions

    print("Model NCC score is: {}".format(accuracy_score(expected, predictions)))
    return classifier, predictions

def train_modelGPC(dataset):
    """Fit a ``GaussianProcessClassifier`` and predict the test split.

    The estimator needs dense input, while the vectorizers in this file
    produce sparse matrices, so sparse splits are converted with
    ``toarray()`` first (this can be memory-hungry for large vocabularies).

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    x_train = dataset['train']['x']
    x_test = dataset['test']['x']
    if sp.issparse(x_train):
        x_train = x_train.toarray()
    if sp.issparse(x_test):
        x_test = x_test.toarray()

    model = GaussianProcessClassifier(kernel=None, optimizer='fmin_l_bfgs_b')
    model.fit(x_train, dataset['train']['y'])
    y_pred = model.predict(x_test)
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        print("Model GPC score is: {}".format(score))

    return model, y_pred

def train_modelGaussianNB(dataset):
    """Fit a ``GaussianNB`` classifier and predict the test split.

    The estimator needs dense input, while the vectorizers in this file
    produce sparse matrices, so sparse splits are converted with
    ``toarray()`` first (this can be memory-hungry for large vocabularies).

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    x_train = dataset['train']['x']
    x_test = dataset['test']['x']
    if sp.issparse(x_train):
        x_train = x_train.toarray()
    if sp.issparse(x_test):
        x_test = x_test.toarray()

    model = GaussianNB(priors=None)
    model.fit(x_train, dataset['train']['y'])
    y_pred = model.predict(x_test)
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        print("Model GNB score is: {}".format(score))

    return model, y_pred

def train_modelDTC(dataset):
    """Fit a class-balanced Gini decision tree and predict the test split.

    NOTE: this redefines the ``train_modelDTC`` declared earlier in the file;
    being the later definition, it is the one callers actually get.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    # `presort=False` was dropped: the argument was deprecated and then removed
    # from scikit-learn, and omitting it keeps the same (non-presorted) behavior.
    model = DecisionTreeClassifier(criterion='gini', splitter='best', class_weight="balanced")
    model.fit(dataset['train']['x'], dataset['train']['y'])
    y_pred = model.predict(dataset['test']['x'])
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        print("Model DTC score is: {}".format(score))

    return model, y_pred

def train_modelABC(dataset):
    """Fit an ``AdaBoostClassifier`` with library defaults and predict.

    Prints the test accuracy when test labels are present and returns
    ``(fitted_model, predictions)``.
    """
    # The former explicit arguments (base_estimator=None, algorithm='SAMME.R')
    # only restated the defaults of the time; newer scikit-learn releases
    # renamed/removed them, so relying on the defaults keeps this portable.
    model = AdaBoostClassifier()
    model.fit(dataset['train']['x'], dataset['train']['y'])
    y_pred = model.predict(dataset['test']['x'])
    if dataset['test']['y'] is not None:
        score = accuracy_score(dataset['test']['y'], y_pred)
        print("Model ABC score is: {}".format(score))

    return model, y_pred

def train_modelMLPC(dataset):
    """Fit a single-hidden-layer ``MLPClassifier`` and predict the test split.

    Uses 100 ReLU units, the Adam solver, ``alpha=0.0001`` and a constant
    learning rate initialised at 0.001. Prints the test accuracy when test
    labels are present and returns ``(fitted_model, predictions)``.
    """
    network = MLPClassifier(
        hidden_layer_sizes=(100, ),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        learning_rate = 'constant',
        learning_rate_init=0.001,
    )
    network.fit(dataset['train']['x'], dataset['train']['y'])
    predictions = network.predict(dataset['test']['x'])

    expected = dataset['test']['y']
    if expected is None:
        return network, predictions

    print("Model MLPC score is: {}".format(accuracy_score(expected, predictions)))
    return network, predictions

### Submission

def create_submit_file(df_submission, ypred):
    """Write the predictions to a timestamped ``submission_<date>.csv`` file.

    *ypred* is stored in the 'airline_sentiment' column of *df_submission*
    (the frame is modified in place) and only that column, together with the
    index, is written to the CSV in the current working directory.
    """
    stamp = datetime.datetime.now().strftime("%m_%d_%Y-%H_%M_%S")
    filename = ''.join(['submission_', stamp, '.csv'])

    df_submission['airline_sentiment'] = ypred
    predictions_only = df_submission[['airline_sentiment']]
    predictions_only.to_csv(filename)

    print('Submission file created: {}'.format(filename))
    print('Upload it to Kaggle InClass')
    
def build_bag_of_words_features_filtered(words):
    """Return ``{word: 1}`` for every word not listed in ``useless_words``.

    NOTE(review): ``useless_words`` is a module-level name that is not defined
    in the visible part of this file -- it must exist before this is called
    with a non-empty *words*.
    """
    features = {}
    for word in words:
        if word in useless_words:
            continue
        features[word] = 1
    return features


## Pipeline
### Read dataset

In [None]:
# Render matplotlib figures inline in the notebook (IPython magic).
%matplotlib inline

# Global figure defaults used by every plot below.
matplotlib.rcParams['figure.figsize'] = (20.0, 20.0)
matplotlib.rcParams['figure.dpi'] = 200
    
# Load the tweets; the CSV is UTF-16 encoded and indexed by 'tweet_id'.
df = pd.read_csv('tweets_spanish.csv', encoding='utf-16', index_col='tweet_id', sep=',')
# Parse the creation timestamps so they can be used as a datetime index later.
df.tweet_created = pd.to_datetime(df.tweet_created)
# Preview of the two columns the analysis is built on (text and label).
head=df[['text','airline_sentiment']]
head.head()

print("Number of tweets:", df.shape[0])

### NaN analysis

In [None]:
# First rows that contain at least one missing value.
df[df.isnull().any(axis=1)].head()
# Number of rows with at least one missing value.
np.sum(df.isnull().any(axis=1))
# Per column: does it contain any missing value?
df.isnull().any(axis=0)
# Per column: how many values are missing?
df.isnull().sum()
# Fraction of rows represented by the hard-coded count 2748.
# NOTE(review): 2748 is presumably the number of missing user_timezone values
# taken from the output above -- TODO confirm / compute it from df instead.
usertime=2748/len(df)
# Tweet coordinates, tweet location and user_timezone are considered for removal
# because more than 25 % of their values are missing.
# NOTE(review): no column is actually dropped in this cell.

### Tweet text cleaning

In [None]:
# Snapshot of the data set before cleaning. A real copy is required: a plain
# `bcdataset_tmp = df` would only alias `df`, so every in-place change below
# would also show up in the "before cleaning" data.
bcdataset_tmp = df.copy()

# Strip accents/diacritics: decompose (NFKD), drop non-ASCII bytes, decode back.
df['text'] = df['text'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

# One list of lemmas per tweet. `.apply` keeps each list as a single cell;
# wrapping ragged lists in np.array is fragile (it can build a 2-D array or
# fail outright depending on the list lengths and the NumPy version).
df['text_clean'] = df['text'].apply(precleaning)
# Bigrams + trigrams built from those lemmas.
df['grams'] = df.text_clean.apply(ngrams)
df[['grams']].head()

# Persist the cleaned columns and read them back. After the CSV round trip the
# list-valued columns come back as strings.
clean_df = df[['text_clean', 'airline_sentiment', 'grams']]
clean_df.to_csv('clean_tweet.csv', encoding='utf-8')
csv = 'clean_tweet.csv'
my_df = pd.read_csv(csv, index_col=0)
my_df.head()

### Description spanish dataset clean
#### Most frequent negative tweets

In [None]:
# 20 most frequent n-grams among the tweets labelled 'negative'.
negative_rows = df[(df.airline_sentiment == 'negative')]
negative_counts = negative_rows[['grams']].apply(count_words)
negative_counts['grams'].most_common(20)

#### Most frequent positive tweets

In [None]:
# 20 most frequent n-grams among the tweets labelled 'positive'.
positive_rows = df[(df.airline_sentiment == 'positive')]
positive_counts = positive_rows[['grams']].apply(count_words)
positive_counts['grams'].most_common(20)

#### Wordclouds for positive tweets

In [None]:
# Word clouds for the positive tweets: first unfiltered, then with a custom
# list of uninformative words removed.
pos_tweets = my_df[my_df.airline_sentiment == 'positive']

# Concatenate the n-gram strings of every positive tweet into one text blob.
pos_string = list(pos_tweets.grams)
pos_string = pd.Series(pos_string).str.cat(sep=' ')

wc = {'width': 600, 'height': 300, 'random_state': 0}
wordcloud = WordCloud(**wc).generate(pos_string)

plt.imshow(wordcloud)
plt.axis("off")

# Words to leave out of the second cloud (variants with a trailing quote are
# listed too, since the n-grams were read back from CSV as quoted text).
ignore = {
    "numero", "euro", "euro'", "suerte", "suerte'", 'hola', "hola'", 'ano', "ano'", 'deseadme', "deseadme'",
    "mejor'", 'destino', 'destinos', 'europeos', "europeos'", 'click', "click'", 'aeropuerto', "gracias",
    "gracias'", "aeropuerto'", 'solo', "solo'", 'hacer', "hacer'", 'decadas', "decadas'", 'dia', "dia'",
    'avion', "avion'", 'espana', "espana'", 'madrid', "madrid'", "spanair", 'letal', "letal'", 'vuelo',
    'volar', "volar'", "vuelo'", 'vuelos', "vuelos'", "ma'", "ma", "q'", "si", "si'", "d'",
}
fwc = dict(wc, stopwords=ignore)
wordcloud = WordCloud(**fwc).generate(pos_string)

plt.imshow(wordcloud)
plt.axis("off")


#### Wordclouds for negative tweets

In [None]:
# Word clouds for the negative tweets: first unfiltered, then with a custom
# list of uninformative words removed.
neg_tweets = my_df[my_df.airline_sentiment == 'negative']

# Concatenate the n-gram strings of every negative tweet into one text blob.
neg_string = list(neg_tweets.grams)
neg_string = pd.Series(neg_string).str.cat(sep=' ')

wc = {'width': 600, 'height': 300, 'random_state': 0}
wordcloud = WordCloud(**wc).generate(neg_string)

plt.imshow(wordcloud)
plt.axis("off")

# Words to leave out of the second cloud (variants with a trailing quote are
# listed too, since the n-grams were read back from CSV as quoted text).
ignore = {
    'aeropuerto', "aeropuerto'", 'solo', "solo'", 'hacer', "hacer'", 'decadas', "decadas'", 'dia', "dia'",
    'avion', "avion'", 'espana', "espana'", 'madrid', "madrid'", "spanair", 'letal', "letal'", 'ryanair',
    "ryanair'", "iberia'", 'iberia', 'vuelo', 'volar', "volar'", "vuelo'", 'vuelos', "vuelos'", "ma'", "ma",
    "q'", "si", "si'", "d'",
}
fwc = dict(wc, stopwords=ignore)
wordcloud = WordCloud(**fwc).generate(neg_string)

plt.imshow(wordcloud)
plt.axis("off")

#### Percentage of positive, neutral and negative tweets

In [None]:
# Share of each sentiment class, shown as a pie chart.
neutral_tweets = my_df[my_df.airline_sentiment == 'neutral']

# Compute the denominator once instead of once per class.
total_tweets = len(pos_tweets) + len(neg_tweets) + len(neutral_tweets)
perc_pos = len(pos_tweets) / total_tweets
perc_neg = len(neg_tweets) / total_tweets
perc_neutral = len(neutral_tweets) / total_tweets

print(perc_pos, perc_neutral, perc_neg)

labels = 'Positive', 'Neutral', 'Negative'
pie = [perc_pos, perc_neutral, perc_neg]
plt.rcParams['font.size'] = 44.0
# Colours are listed in the same order as `labels`, using the scheme of the
# other charts in this notebook: positive = green, neutral = blue,
# negative = red (previously positive was drawn red and negative green).
colors = [sns.xkcd_rgb["medium green"],
          sns.xkcd_rgb["denim blue"],
          sns.xkcd_rgb["pale red"]]
explode = (0.1, 0, 0)  # pull the first (positive) slice out slightly

plt.pie(pie, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=False, radius=0.5)

plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()




#### Frequencies

In [None]:
# Number of tweets per sentiment class, shown as a bar chart.
sentiments = df.groupby('airline_sentiment').size()

bars = ('negative', 'neutral', 'positive')
# Take the bar heights from the counts computed above instead of hard-coding
# them, so the chart follows the data; a class with no tweets gets height 0.
height = [sentiments.get(label, 0) for label in bars]
y_pos = np.arange(len(bars))

# Vertical bars, one per class: negative = red, neutral = blue, positive = green.
plt.bar(y_pos, height, color=[sns.xkcd_rgb["pale red"],
                              sns.xkcd_rgb["denim blue"],
                              sns.xkcd_rgb["medium green"]])
plt.xticks(y_pos, bars)
plt.show()


#### Tweets per day

In [None]:
# Daily number of tweets per sentiment class, shown as a line chart.
df_by_date = df.set_index('tweet_created')
df_date_sent = df_by_date[['airline_sentiment']]

# One indicator column per sentiment, still indexed by creation time.
sent_onehot = pd.get_dummies(df_date_sent['airline_sentiment'])

# Sum the indicators per calendar day. The one-hot frame is grouped directly:
# stacking it under the label frame row-wise (the former pd.concat) only added
# all-NaN rows. pd.Grouper replaces pd.TimeGrouper, which has been removed
# from pandas.
sums = sent_onehot.groupby(pd.Grouper(freq='D')).sum()
sm = sums.dropna()

# NOTE(review): the colours assume the dummy columns come out in the order
# negative, neutral, positive -- confirm against `sm.columns`.
ax = sm.plot(color=[sns.xkcd_rgb["pale red"],
                    sns.xkcd_rgb["denim blue"],
                    sns.xkcd_rgb["medium green"]])
ax.set_ylabel('Number of tweets')
ax.set_xlabel('Date')



### Obtaining data for building a predictive model


In [None]:

# Vectorise the full (imbalanced) data set. obtain_data_representation()
# fills df['text_clean'] itself, so no separate cleaning pass is needed here.
dataset = obtain_data_representation(df)

# Build a balanced data set: keep every positive tweet and draw the same
# number of negative and neutral tweets at random.
am_pos = len(pos_tweets)
am_neg = len(neg_tweets)
am_neutral = len(neutral_tweets)

print(am_pos)
print(am_neg)
print(am_neutral)

neg_tweets_df = df.loc[df.airline_sentiment == 'negative']
neu_tweets_df = df.loc[df.airline_sentiment == 'neutral']
pos_tweets_df = df.loc[df.airline_sentiment == 'positive']

indeces_negative = neg_tweets_df.index.tolist()
indeces_neutral = neu_tweets_df.index.tolist()

# The sample size is the number of positive tweets, i.e. positive is assumed
# to be the smallest class; random.sample raises ValueError if a class has
# fewer rows than that.
n_per_class = pos_tweets_df.shape[0]
inds_neg = random.sample(indeces_negative, n_per_class)
inds_neu = random.sample(indeces_neutral, n_per_class)

print(neg_tweets_df.head())

# Select the sampled rows by label in one step. The neutral sample is taken
# from the neutral frame (it used to be cut out of the negative frame with a
# mask built from the neutral index).
neg_df1 = neg_tweets_df.loc[inds_neg]
neu_df1 = neu_tweets_df.loc[inds_neu]

# pd.concat replaces the repeated DataFrame.append calls (removed from pandas).
balanced_df = pd.concat([pos_tweets_df, neg_df1, neu_df1])

# obtain_data_representation() also (re)computes balanced_df['text_clean'].
bdataset = obtain_data_representation(balanced_df)


### Model training

#### Testing models with imbalanced data

In [None]:

# Train every candidate classifier on the imbalanced data set. Each call prints
# its test accuracy; the predictions (second return value) are discarded.

# Generic trainer with each estimator's default hyper-parameters.
modelBNB, _ = train_model(dataset, BernoulliNB)
modelmn, _ = train_model(dataset,  MultinomialNB)
modelrc, _ = train_model(dataset,  RidgeClassifier)
modelp, _ = train_model(dataset,  Perceptron)
modelpac, _ = train_model(dataset,  PassiveAggressiveClassifier)
modelrfc, _ = train_model(dataset,  RandomForestClassifier)
# Dedicated trainers with the hyper-parameters fixed in their definitions.
modelMNB, _ = train_modelMNB(dataset)
modelP, _ = train_modelP(dataset)
modelRC, _ = train_modelRC(dataset)
modelPAC, _ = train_modelPAC(dataset)
modelRFC, _ = train_modelRFC(dataset)
modelSGD, _ = train_modelSGD(dataset)
modelSVC, _ = train_modelSVC(dataset)
modelKNC, _ = train_modelKNC(dataset)
modelNCC, _ = train_modelNCC(dataset)
# train_modelDTC is defined twice in this file; the later definition is the one called.
modelDTC, _ = train_modelDTC(dataset)
modelABC, _ = train_modelABC(dataset)
modelETC, _ = train_modelETC(dataset)
modelETC2, _ = train_modelETC2(dataset)


#### Testing models with balanced data


In [None]:
# Same sequence of classifiers, trained on the balanced data set. The model
# variables from the previous cell are overwritten.

# Generic trainer with each estimator's default hyper-parameters.
modelBNB, _ = train_model(bdataset, BernoulliNB)
modelmn, _ = train_model(bdataset,  MultinomialNB)
modelrc, _ = train_model(bdataset,  RidgeClassifier)
modelp, _ = train_model(bdataset,  Perceptron)
modelpac, _ = train_model(bdataset,  PassiveAggressiveClassifier)
modelrfc, _ = train_model(bdataset,  RandomForestClassifier)
# Dedicated trainers with the hyper-parameters fixed in their definitions.
modelMNB, _ = train_modelMNB(bdataset)
modelP, _ = train_modelP(bdataset)
modelRC, _ = train_modelRC(bdataset)
modelPAC, _ = train_modelPAC(bdataset)
modelRFC, _ = train_modelRFC(bdataset)
modelSGD, _ = train_modelSGD(bdataset)
modelSVC, _ = train_modelSVC(bdataset)
modelKNC, _ = train_modelKNC(bdataset)
modelNCC, _ = train_modelNCC(bdataset)
# train_modelDTC is defined twice in this file; the later definition is the one called.
modelDTC, _ = train_modelDTC(bdataset)
modelABC, _ = train_modelABC(bdataset)
modelETC, _ = train_modelETC(bdataset)
modelETC2, _ = train_modelETC2(bdataset)

#### Testing models with data before cleaning

In [None]:
# Vectorise the frame captured as `bcdataset_tmp` in the text-cleaning cell,
# using the tweet text as-is (no cleaning() pass).
bcdataset = obtain_raw_data_representation(bcdataset_tmp)

# Same sequence of classifiers as above; the model variables are overwritten.
# Generic trainer with each estimator's default hyper-parameters.
modelBNB, _ = train_model(bcdataset, BernoulliNB)
modelmn, _ = train_model(bcdataset,  MultinomialNB)
modelrc, _ = train_model(bcdataset,  RidgeClassifier)
modelp, _ = train_model(bcdataset,  Perceptron)
modelpac, _ = train_model(bcdataset,  PassiveAggressiveClassifier)
modelrfc, _ = train_model(bcdataset,  RandomForestClassifier)
# Dedicated trainers with the hyper-parameters fixed in their definitions.
modelMNB, _ = train_modelMNB(bcdataset)
modelP, _ = train_modelP(bcdataset)
modelRC, _ = train_modelRC(bcdataset)
modelPAC, _ = train_modelPAC(bcdataset)
modelRFC, _ = train_modelRFC(bcdataset)
modelSGD, _ = train_modelSGD(bcdataset)
modelSVC, _ = train_modelSVC(bcdataset)
modelKNC, _ = train_modelKNC(bcdataset)
modelNCC, _ = train_modelNCC(bcdataset)
# train_modelDTC is defined twice in this file; the later definition is the one called.
modelDTC, _ = train_modelDTC(bcdataset)
modelABC, _ = train_modelABC(bcdataset)
modelETC, _ = train_modelETC(bcdataset)
modelETC2, _ = train_modelETC2(bcdataset)