In [1]:
import tensorflow as tf
import numpy as np
from numpy.random import seed
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from random import randint
import os
import nltk
import string
import regex as re
import emoji

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer


from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import make_scorer
f1 = make_scorer(f1_score , average='macro')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingCVClassifier

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.random import set_seed
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

2021-09-26 22:07:09.217987: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-09-26 22:07:09.218097: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Import Datasets

In [2]:
train_df = pd.read_csv('../input/climate-change-tweet-data/train.csv')
test_df = pd.read_csv('../input/climate-change-tweet-data/test_with_no_labels.csv')

pd.options.display.max_colwidth = 250
pd.options.display.width = None
train_df.head(20)

Unnamed: 0,sentiment,message,tweetid
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221
1,1,It's not like we lack evidence of anthropogenic global warming,126103
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954
5,1,Worth a read whether you do or don't believe in climate change https://t.co/ggLZVNYjun https://t.co/7AFE2mAH8j,425577
6,1,RT @thenation: Mike Pence doesn’t believe in global warming or that smoking causes lung cancer. https://t.co/gvWYaauU8R,294933
7,1,"RT @makeandmendlife: Six big things we can ALL do today to fight climate change, or how to be a climate activistÃ¢â‚¬Â¦ https://t.co/TYMLu6DbNM hÃ¢â‚¬Â¦",992717
8,1,"@AceofSpadesHQ My 8yo nephew is inconsolable. He wants to die of old age like me, but will perish in the fiery hellscape of climate change.",664510
9,1,RT @paigetweedy: no offense… but like… how do you just not believe… in global warming………,260471


In [3]:
X_tv, X_test, y_tv, y_test = train_test_split(train_df['message'],train_df['sentiment'],test_size = 0.1, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size = 0.2, random_state = 42)
print(X_train.head())

9178      Lol love this: climate change's a threat to Mattis bc it'll make terrorizing the mideast to secure US oil assets a… https://t.co/6c4JZD4W8x
6326      @realDonaldTrump Why have you forbid our scientists to talk about climate change? Why remove water regulations? Do… https://t.co/EBzi09iiIH
13534    Have fun enjoying your President who doesn't believe in climate change. Educate yourself because your children's lives depend on it Ã°Å¸ËœÅ 
4792                         RT @KenyaCIC: clear evidence of climate change join the movement #CLP17 let's make a change @SustainAfri @ClimateLaunch…
8885     RT @bradplumer: The Energy Department has rejected Trump’s request to identify employees who worked on climate change: https://t.co/VjVUe2Y…
Name: message, dtype: object


## Preprocess Data

In [4]:
def CleanTweets(list_of_tweets):
    clean_tweets = []
    lemmer=WordNetLemmatizer()
    
    for tweet in list_of_tweets:
        tweet = re.sub("@[A-Za-z0-9]+","",tweet)                          #Remove @ sign
        tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)  #Removes full http links
        tweet = re.sub(r'htt\S+', '', tweet)                              #Remove partial http links
        tweet = re.sub("\d+", " ", tweet)                                #Remove numbers
        tweet = " ".join(tweet.split())
        tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
        tweet = tweet.replace("#", "").replace("_", " ")                  #Remove hashtag sign but keep the text
        tweet = tweet.replace("&amp;", "and").replace('Ã¢â‚¬Â¦','...')    #Removes weird encoding issue

        tweet = re.sub(r"won\'t", "will not", tweet)                      #Remove all contractions
        tweet = re.sub(r"can\'t", "can not", tweet)

        tweet = re.sub(r"n\'t", " not", tweet)
        tweet = re.sub(r"\'re", " are", tweet)
        tweet = re.sub(r"\'s", " is", tweet)
        tweet = re.sub(r"\'d", " would", tweet)
        tweet = re.sub(r"\'ll", " will", tweet)
        tweet = re.sub(r"\'t", " not", tweet)
        tweet = re.sub(r"\'ve", " have", tweet)
        tweet = re.sub(r"\'m", " am", tweet)

        tweet = tweet.encode('ascii', errors='replace').decode('utf8', errors='replace').encode('ascii', errors='replace').decode('utf8', errors='replace')
                 
        tweet = ' '.join([lemmer.lemmatize(word) for word in tweet.split(' ')])    
            
        clean_tweets.append(tweet)
                    
        clean_tweets = [str(tweet) for tweet in clean_tweets]
    return clean_tweets

In [5]:
#X_train_clean = pd.DataFrame(CleanTweets(X_train))
#X_val_clean = pd.DataFrame(CleanTweets(X_val))
#X_tv_clean = pd.DataFrame(CleanTweets(X_tv))

X_full_clean = pd.DataFrame(CleanTweets(train_df['message']))
X_test_clean = pd.DataFrame(CleanTweets(test_df['message']))

In [6]:
#vect = CountVectorizer(max_features=15000, stop_words='english', analyzer='word', ngram_range=(1, 2))
#vectVal = CountVectorizer(max_features=15000, stop_words='english', analyzer='word', ngram_range=(1, 2))
vectTest = CountVectorizer(max_features=20000, stop_words='english', analyzer='word', ngram_range=(1, 2))


#X_t = vect.fit_transform(X_train_clean.stack()).toarray()
#X_tv = vectVal.fit_transform(X_tv_clean.stack()).toarray()
X_full = vectTest.fit_transform(X_full_clean.stack()).toarray()
X_test_full = vectTest.transform(X_test_clean.stack()).toarray()

In [7]:
def create_model(lyrs=[8], act='linear', opt='Adam', dr=0.0):
    seed(42)
    set_seed(42)
    
    model = Sequential()
    
    
    ##### INPUT DIMS ######
    model.add(Dense(lyrs[0], input_dim=X_full.shape[1], activation=act))
    
    for i in range(1,len(lyrs)):
        model.add(Dense(lyrs[i], activation=act))
    
    # add dropout, default is none
    model.add(Dropout(dr))
    
    # create output layer
    model.add(Dense(4, activation='softmax'))  # output layer
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

## Validate Models

In [8]:
## NAIVE BAYES

mnb_param_grid = {'alpha': [0.7]}
grid_mnb = GridSearchCV(MultinomialNB(), mnb_param_grid, scoring = f1, cv = 5, verbose = 10)

grid_mnb.fit(X_t, y_train)

grid_mnb.cv_results_
grid_mnb.best_estimator_

NameError: name 'X_t' is not defined

In [None]:
## RANDOM FOREST

rf_param_grid = {'max_depth': [100],
 'max_features': ['auto'],
 'min_samples_leaf': [1],
 'min_samples_split': [2],
 'n_estimators': [800]}
grid_rf = GridSearchCV(RandomForestClassifier(class_weight = 'balanced'), rf_param_grid, scoring = f1, cv = 5, verbose = 10)

grid_rf.fit(X_t, y_train)

grid_rf.cv_results_
grid_rf.best_estimator_

In [None]:
## KNN

knn_param_grid = {'n_neighbors' : [1]}
grid_knn = GridSearchCV(KNeighborsClassifier(algorithm = 'brute'), knn_param_grid, scoring = f1, cv = 5, verbose = 10)

grid_knn.fit(X_t, y_train)

grid_knn.cv_results_
grid_knn.best_estimator_

In [None]:
## NEURAL NET 
model = KerasClassifier(build_fn=create_model, verbose=0)


nn_param_grid = {'batch_size' : [256],
                    'epochs' : [50]}

grid_nn = GridSearchCV(model, nn_param_grid, scoring = f1, cv = 5, verbose = 10)

grid_nn.fit(X_t, y_train)

grid_nn.cv_results_
grid_nn.best_estimator_

In [None]:
## LOGISTIC REGRESSION

lr_param_grid = {'penalty': ['l2'], 'C': [1]}

grid_lr = GridSearchCV(LogisticRegression(class_weight = 'balanced', solver = 'newton-cg'), lr_param_grid, scoring = f1, cv = 5, verbose = 10)

grid_lr.fit(X_t, y_train)

grid_lr.cv_results_
grid_lr.best_estimator_

## Build models and stacking classifier

In [None]:
clf1 = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute')
clf2 = RandomForestClassifier(max_depth = 100, max_features ='auto',min_samples_leaf = 1, min_samples_split = 2, n_estimators = 800, class_weight = 'balanced')
clf3 = MultinomialNB(alpha = 0.7)
clf4 = LogisticRegression(solver = 'newton-cg', penalty = 'l2', C = 1, class_weight = 'balanced')
clf5 = KerasClassifier(build_fn=create_model, verbose=0, batch_size = 256, epochs = 50)
clf5._estimator_type = "classifier"

lr = LogisticRegression()

vclf = VotingClassifier(estimators = [('KNN',clf1), 
                                        ('Random Forest',clf2), 
                                        ('Naive Bayes', clf3), 
                                        ('Logistic Regression', clf4), 
                                        ('Neural Network', clf5)
                                       ], verbose = 10, n_jobs = -1)


#sclf = StackingClassifier(estimators = [#('KNN',clf1), 
#                                        #('Random Forest',clf2), 
##                                        ('Naive Bayes', clf3), 
#                                        ('Logistic Regression', clf4), 
#                                        ('Neural Network', clf5)
#                                       ],
#                            final_estimator=lr, verbose = 10, n_jobs = -1)


In [None]:
for clf, label in zip([clf1, clf2, clf3, clf4, clf5,sclf], #   
                      ['KNN', 'Random Forest', 'Naive Bayes', 'Logistic Regression', 'NeuralNetwork',
                       'StackingClassifier']):

    scores = cross_val_score(clf, X_tv, y_tv, cv = 5, scoring= f1, verbose = 10, method='predict')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    
#Accuracy: 0.46 (+/- 0.01) [KNN]
#Accuracy: 0.58 (+/- 0.01) [Random Forest]
#Accuracy: 0.61 (+/- 0.01) [Naive Bayes]
#Accuracy: 0.63 (+/- 0.01) [Logistic Regression]
#Accuracy: 0.61 (+/- 0.01) [Neural Network]


In [None]:
clf2.fit(X_full, train_df['sentiment'])

In [None]:
ypred = clf2.predict(X_test_full)

In [None]:
clf4.fit(X_full, train_df['sentiment'])

In [None]:
ypred = clf4.predict(X_test_full)

In [None]:
clf3.fit(X_full, train_df['sentiment'])

In [None]:
ypred = clf3.predict(X_test_full)

In [None]:
clf5.fit(X_full, train_df['sentiment'])

In [None]:
ypred = clf5.predict(X_test_full)

In [None]:
from mlxtend.classifier import EnsembleVoteClassifier
import copy
eclf = EnsembleVoteClassifier(clfs=[clf2, clf3, clf4, clf5], weights=[1,1,1,1], fit_base_estimators=False)
eclf.fit(X_full, train_df['sentiment'])

In [None]:
ypred = eclf.predict(X_test_full)

In [None]:
results = pd.DataFrame(test_df.tweetid)
results['sentiment'] = ypred
results

In [None]:
cwd = os.getcwd()
path = cwd + "/Submission26Sept5.csv"
results.to_csv(path, index = False)

In [None]:
cwd
