In [0]:
# !pip install contractions

In [0]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-deep')

#text preprocessing
import re
import string
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import Word

In [0]:
train = pd.read_csv("https://raw.githubusercontent.com/monicafar147/classification-predict-streamlit-template/master/climate-change-belief-analysis/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/monicafar147/classification-predict-streamlit-template/master/climate-change-belief-analysis/test.csv")

In [187]:
print("Train\n")
print(train.head(5))
print("\nTest")
print(test.head(5))

Train

   sentiment                                            message  tweetid
0          1  PolySciMajor EPA chief doesn't think carbon di...   625221
1          1  It's not like we lack evidence of anthropogeni...   126103
2          2  RT @RawStory: Researchers say we have three ye...   698562
3          1  #TodayinMaker# WIRED : 2016 was a pivotal year...   573736
4          1  RT @SoyNovioDeTodas: It's 2016, and a racist, ...   466954

Test
                                             message  tweetid
0  Europe will now be looking to China to make su...   169760
1  Combine this with the polling of staffers re c...    35326
2  The scary, unimpeachable evidence that climate...   224985
3  @Karoli @morgfair @OsborneInk @dailykos \nPuti...   476263
4  RT @FakeWillMoore: 'Female orgasms cause globa...   872928


In [0]:
def _preprocess(data):
  df = data.copy()

  # apply lowercase to data
  data['message'] = data['message'].apply(lambda word: ''.join(word.lower()))

  # function to remove contraction
  def remove_contraction(row):
    fixed = [contractions.fix(word) for word in row.split()]
    return ' '.join(map(str,fixed))

  # replace contractions
  df['message'] = np.vectorize(remove_contraction)(df['message'])

  # function to remove patterns
  def remove_pattern(text,pattern,replacement=''):
    remove_this = re.findall(pattern, text)
    for item in remove_this:
      text = re.sub(item, replacement, text)
    return text

  # remove hashtags
  # df['message'] = np.vectorize(remove_pattern)(df['message'],"#[\w]*")

  # remove username
  # df['message'] = np.vectorize(remove_pattern)(df['message'],"@[\w]*")

  # remove URL
  # df['message'] = df['message'].apply(lambda word: re.split('https:\/\/.*', str(word))[0])

  # remove punctuation
  df['message'] = df['message'].apply(lambda word: word.translate(str.maketrans('', '', string.punctuation)))

  # remove stopwords
  stop_words = stopwords.words('english')
  data['message'] = data['message'].apply(lambda word: ' '.join(word for word in word.split() if word not in stop_words))

  # remove retweet as rt
  df['message'] = np.vectorize(remove_pattern)(df['message'],"RT[\w]*")

  return df

In [0]:
trained = train[['sentiment','message','tweetid']]
tested = test[['message','tweetid']]

In [0]:
# import nltk
# nltk.download('stopwords')

In [0]:
# Using the preprocess function we created
train_processed = _preprocess(trained)
test_processed = _preprocess(tested)

In [192]:
# print out cleaned text
for_msg = [word for word in train_processed['message']]
index = 1
for tweet in for_msg[0:10]:
    print(str(index)+": " + tweet)
    print('\n')
    index += 1

1: PolySciMajor EPA chief does not think carbon dioxide is main because of global warming and wait what httpstcoyeLvcEFXkC via mashable


2: it is not like we lack evidence of anthropogenic global warming


3:  RawStory Researchers say we have three years to act on climate change before it is too late httpstcoWdT0KdUr2f httpstcoZ0ANPT…


4: TodayinMaker WIRED  2016 was a pivotal year in the war on climate change httpstco44wOTxTLcD


5:  SoyNovioDeTodas it is 2016 and a racist sexist climate change denying bigot is leading in the polls ElectionNight


6: Worth a read whether you do or do not believe in climate change httpstcoggLZVNYjun httpstco7AFE2mAH8j


7:  thenation Mike Pence does not believe in global warming or that smoking causes lung cancer httpstcogvWYaauU8R


8:  makeandmendlife Six big things we can ALL do today to fight climate change or how to be a climate activistÃ¢â‚¬Â¦ httpstcoTYMLu6DbNM hÃ¢â‚¬Â¦


9: AceofSpadesHQ My 8yo nephew is inconsolable He wants to die of old ag

In [0]:
from sklearn.model_selection import train_test_split
# Splitting the labels and features
X = train_processed['message']
y = train_processed['sentiment']
# Splitting the labels and fetures into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [195]:
text_cls = Pipeline([('tfidf',TfidfVectorizer()),('classify',LinearSVC(C=1))])
text_cls.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classify',
                 LinearSVC(C=1, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
            

In [196]:
pred = text_cls.predict(X_test)
test_processed

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,The scary unimpeachable evidence that climate ...,224985
3,Karoli morgfair OsborneInk dailykos Putin got ...,476263
4,FakeWillMoore Female orgasms because global w...,872928
...,...,...
10541,BrittanyBohrer Brb writing a poem about clima...,895714
10542,2016 the year climate change came home During ...,875167
10543,loopvanuatu Pacific countries positive about ...,78329
10544,xanria00018 you are so hot you must be the be...,867455


In [0]:
x_unseen = test_processed['message']
y_prediction = text_cls.predict(x_unseen)

In [198]:
from sklearn.metrics import confusion_matrix,classification_report
report_1 = print(classification_report(y_test, pred))
report_1

              precision    recall  f1-score   support

          -1       0.74      0.50      0.59       278
           0       0.56      0.43      0.49       425
           1       0.78      0.86      0.82      1755
           2       0.75      0.77      0.76       706

    accuracy                           0.75      3164
   macro avg       0.71      0.64      0.66      3164
weighted avg       0.74      0.75      0.74      3164



# Testing

In [199]:
train

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954
...,...,...,...
15814,1,RT @ezlusztig: They took down the material on ...,22001
15815,2,RT @washingtonpost: How climate change could b...,17856
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248
15817,-1,RT @sara8smiles: Hey liberals the climate chan...,819732


In [200]:
test

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928
...,...,...
10541,"RT @BrittanyBohrer: Brb, writing a poem about ...",895714
10542,2016: the year climate change came home: Durin...,875167
10543,RT @loop_vanuatu: Pacific countries positive a...,78329
10544,"RT @xanria_00018: You’re so hot, you must be t...",867455


In [201]:
test_df = test.copy()
X_test_df = test_df['message']
X_test_df.head(5)

0    Europe will now be looking to China to make su...
1    Combine this with the polling of staffers re c...
2    The scary, unimpeachable evidence that climate...
3    @Karoli @morgfair @OsborneInk @dailykos \nPuti...
4    RT @FakeWillMoore: 'Female orgasms cause globa...
Name: message, dtype: object

In [202]:
pred_2 = text_cls.predict(X_test_df)
pred_2

array([1, 1, 1, ..., 1, 0, 0])

In [203]:
test_df['sentiment'] = pred_2
test_df

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,2
...,...,...,...
10541,"RT @BrittanyBohrer: Brb, writing a poem about ...",895714,1
10542,2016: the year climate change came home: Durin...,875167,1
10543,RT @loop_vanuatu: Pacific countries positive a...,78329,1
10544,"RT @xanria_00018: You’re so hot, you must be t...",867455,0


In [204]:
train_df = train.copy()
frames = [train_df, test_df]
result = pd.concat(frames)
result


Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954
...,...,...,...
10541,1,"RT @BrittanyBohrer: Brb, writing a poem about ...",895714
10542,1,2016: the year climate change came home: Durin...,875167
10543,1,RT @loop_vanuatu: Pacific countries positive a...,78329
10544,0,"RT @xanria_00018: You’re so hot, you must be t...",867455


In [0]:
X_2 = result['message']
y_2 = result['sentiment']
# Splitting the labels and fetures into training and testing sets
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2,random_state=42)


In [206]:
text_cls_2 = Pipeline([('tfidf',TfidfVectorizer()),('classify',LinearSVC(C=1))])
text_cls_2.fit(X_train_2, y_train_2)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classify',
                 LinearSVC(C=1, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
            

In [0]:
pred_2 = text_cls_2.predict(X_test)
x_unseen_2 = test_processed['message']
y_prediction_2 = text_cls.predict(x_unseen_2)

In [208]:
from sklearn.metrics import confusion_matrix,classification_report
report_2 = print(classification_report(y_test, pred_2))
report_2

              precision    recall  f1-score   support

          -1       0.91      0.72      0.81       278
           0       0.82      0.80      0.81       425
           1       0.88      0.96      0.91      1755
           2       0.95      0.83      0.88       706

    accuracy                           0.88      3164
   macro avg       0.89      0.83      0.85      3164
weighted avg       0.89      0.88      0.88      3164

