### First bring in the necessary import statements to eda and initial modeling to examine the data.

In [148]:
import re
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import pos_tag
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.manifold import TSNE
from collections import defaultdict
from mpl_toolkits.mplot3d import Axes3D
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from nltk.corpus import stopwords, wordnet
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from nltk.tokenize import word_tokenize, regexp_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, confusion_matrix, classification_report, accuracy_score, precision_score

import os
import sys
sys.setrecursionlimit(100000)
module_path = os.path.abspath(os.pardir)
print(module_path)
if module_path not in sys.path:
    sys.path.append(module_path)
%matplotlib inline
pd.set_option('display.max_colwidth', None)

/Users/jax/Documents/Flatiron


### Bring in the data and examine the head and tail of the set.

In [149]:
org_df = pd.read_csv('data/GW_Sentiment_Tweets.csv')

In [150]:
org_df.count()

tweet                   6027
existence               4187
existence_confidence    6024
dtype: int64

In [151]:
org_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6027 entries, 0 to 6026
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   tweet                 6027 non-null   object 
 1   existence             4187 non-null   object 
 2   existence_confidence  6024 non-null   float64
dtypes: float64(1), object(2)
memory usage: 141.4+ KB


In [152]:
org_df

Unnamed: 0,tweet,existence,existence_confidence
0,"Global warming report urges governments to act|BRUSSELS, Belgium (AP) - The world faces increased hunger and .. [link]",Yes,1.0000
1,Fighting poverty and global warming in Africa [link],Yes,1.0000
2,Carbon offsets: How a Vatican forest failed to reduce global warming [link],Yes,0.8786
3,Carbon offsets: How a Vatican forest failed to reduce global warming [link],Yes,1.0000
4,URUGUAY: Tools Needed for Those Most Vulnerable to Climate Change [link],Yes,0.8087
...,...,...,...
6022,"@bloodless_coup ""The phrase 'global warming' should be abandoned in favor of 'climate change', Luntz says,"" http://bit.ly/7bIY0c #p2 #tcot",Y,1.0000
6023,Virginia to Investigate Global Warming Scientist Mann: http://bit.ly/aDlavg,,1.0000
6024,Global warming you tube parody you will enjoy #IPCC #ocra http://bit.ly/bBGWhC,N,0.6411
6025,One-Eyed Golfer: Don't dare tell me about global warming: Twenty-five of the forty-nine golfers making the cut thi... http://bit.ly/akeAxp,N,1.0000


### From here you can already see there are duplicate rows of tweets strangely with different existence values.

In [153]:
num_unique = (~org_df.tweet.duplicated()).sum()
num_unique

5481

In [154]:
num_duplicate = org_df.tweet.duplicated().sum()
num_duplicate

546

### There are 5481 tweets that are not duplicates. Let's extract the duplicate rows.

In [155]:
org_df.loc[org_df.tweet.duplicated(), :]

Unnamed: 0,tweet,existence,existence_confidence
3,Carbon offsets: How a Vatican forest failed to reduce global warming [link],Yes,1.0000
94,"Plants effective way of tackling global warming|Washington, Apr 30 : Plant leaves account for less than one per .. [link]",Yes,0.7925
111,"CLIMATE CHANGE: Forests Not for Absorbing Carbon, Say Activists [link]",,0.6135
112,"CLIMATE CHANGE: Forests Not for Absorbing Carbon, Say Activists [link]",Yes,0.5763
114,"CLIMATE CHANGE: Forests Not for Absorbing Carbon, Say Activists [link]",Yes,0.8243
...,...,...,...
5840,"Global warming ballot initiative: Teamsters and cities weigh in: The California Teamsters, one of the state's most... http://bit.ly/aMYbOS",,0.6346
5903,"http://theclimatedesk.org/ launches, backed by Wired, Atlantic, Mother Jones, Slate, CIR, NPR, to report climate change impacts, responses",Y,0.6424
5999,Natural Resource Econ: Krugman on Climate Change http://bit.ly/b4IyRj,Y,0.6436
6003,"Valero Gas wants to kill AB 32, CA's global warming law. Join @CredoAction & @CourageCampaign to stop them: http://BoycottValero.com Pls RT",,0.7674


In [156]:
unique_df=org_df.drop_duplicates(subset='tweet', keep='first').copy()

In [157]:
unique_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5481 entries, 0 to 6026
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   tweet                 5481 non-null   object 
 1   existence             3822 non-null   object 
 2   existence_confidence  5478 non-null   float64
dtypes: float64(1), object(2)
memory usage: 171.3+ KB


### Now all our duplicates have been removed and we can continue onto our preprocessing of the tweets. First let's work on removing punctuation and numerals. Then let's remove stop words and tokenize.

In [164]:
cust_sw = stopwords.words('english').extend('link')

In [169]:
#Function to clean tweets
def clean_tweet(tweet):
    tweet_unlink= re.sub(r"http\S+",'', tweet)
    # remove numbers re.sub(pattern, repl, string, count=0, flags=0)
    tweet_nonum = re.sub(r'\d+', '', tweet_unlink)
    # remove punctuations and lower case
    tweet_nopunct = ''.join([char.lower() for char in tweet_nonum if char not in string.punctuation])
    #cut stopwords
    word_filt = [tweet.join('') for tweet in tweet_nopunct if tweet not in cust_sw]
    # substitute multiple whitespace with single whitespace
    cleaned_tweet = re.sub('\s+','', word_filt).strip()
    return cleaned_tweet

In [170]:
#Apply cleaning function to all tweets
unique_df['new_tweets']= unique_df['tweet'].apply(clean_tweet)

TypeError: argument of type 'NoneType' is not iterable

In [171]:
#Instatiate tokenizer object and tokenize
tt = TweetTokenizer()
unique_df['tokens']= unique_df['new_tweets'].apply(tt.tokenize)

KeyError: 'new_tweets'

In [None]:
### Now let's look at NaN values in our existence column. They mean that

In [126]:
unique_df.existence.isna().sum()

1659

In [85]:
def tokenize(sentences):
    words = []
    for sentence in sentences:
        w = word_extraction(sentence)
        words.extend(w)
    words = sorted(list(set(words)))
    return words

In [87]:
def tokenize(tweet):
    for text in tweet:
        joined_tweet = ' '.join(text)
        tokenized_tweet = word_tokenize(joined_tweet)
    return tokenized_tweet
tokenized_test_tweet = tokenize(unique_df['tweet'][88])
tokenized_test_tweet

[']']

In [92]:
def tokenize(tweet):
    for text in tweet:pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tweet_tokens_raw = nltk.regexp_tokenize(org_df['tweet'][0], pattern)
return tokenized_tweet

In [93]:
tweet_tokens = [word.lower() for word in tweet_tokens_raw]
tweet_tokens

['global',
 'warming',
 'report',
 'urges',
 'governments',
 'to',
 'act',
 'brussels',
 'belgium',
 'ap',
 'the',
 'world',
 'faces',
 'increased',
 'hunger',
 'and',
 'link']