# CLIMATE CHANGE BELIEF ANALYSIS 

Author: Pilane Koma

October 2020

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import re,string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## Load Data from Kaggle

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/climate-change-edsa2020-21/train.csv',
        '../input/climate-change-edsa2020-21/test.csv',
    )
)

In [None]:
train.head()

## Split data into y and X

In [None]:
# Features are the raw message text; the target is the sentiment label column.
X, y = train['message'], train['sentiment']

In [None]:
X.head()

In [None]:
#Preprocessing Data

from nltk.stem import PorterStemmer
ps = PorterStemmer ()

# Noise to blank out, tried in this order at every position:
#   1. @mentions (\w so that handles containing underscores are removed whole),
#   2. any single character that is not an ASCII letter, digit, space or tab
#      (this is what strips '#', punctuation, emoji, ...),
#   3. URLs of the form scheme://...
# Written as a raw string so the backslash escapes reach the regex engine intact.
_NOISE_RE = re.compile(r"@\w+|[^0-9A-Za-z \t]|\w+://\S+")


def prep(txt):
    """Clean a raw message and return its stemmed words joined by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. replace @mentions, URLs and every non-alphanumeric character with a
         space (a hashtag keeps its word, only the '#' is dropped);
      3. split on whitespace and Porter-stem each word with the module-level
         stemmer ``ps``.

    Parameters
    ----------
    txt : str
        The raw message text.

    Returns
    -------
    str
        The stemmed words separated by single spaces; an empty string when
        nothing survives the cleaning.
    """
    cleaned = _NOISE_RE.sub(" ", txt.lower())

    # After the substitution only ASCII letters, digits and whitespace remain,
    # so no separate punctuation pass is needed.  str.split() with no argument
    # collapses whitespace runs and never yields empty strings.
    return ' '.join(ps.stem(word) for word in cleaned.split())

In [None]:
#tokenize text
def tknz(txt):
    """Split *txt* on runs of whitespace and return the list of tokens.

    Empty or whitespace-only input gives an empty list, and leading or
    trailing whitespace never produces empty-string tokens, so no spurious
    '' term can end up in a vocabulary built from the result.
    """
    return txt.split()

## Vectorizer

In [None]:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Stop-word filtering is applied to tokens that have already been through
# prep(), i.e. to stemmed words.  The built-in "english" list holds unstemmed
# words (Porter stemming turns e.g. "this" into "thi"), so passing the list
# as-is lets many stop words through.  Run the list through the same
# preprocessing so both sides are compared in stemmed form.
stemmed_stop_words = sorted({prep(word) for word in ENGLISH_STOP_WORDS})

vect = CountVectorizer(
    ngram_range=(1, 2),            # unigrams and bigrams
    max_df=0.8,                    # drop terms present in more than 80% of documents
    tokenizer=tknz,
    preprocessor=prep,
    stop_words=stemmed_stop_words,
    token_pattern=None,            # not used when a custom tokenizer is supplied
)

# Learn the vocabulary from the messages and build the document-term count matrix.
X_Vec = vect.fit_transform(X)

## Split Data

In [None]:
# Hold out 25% of the vectorized rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_Vec,
    y,
    test_size=0.25,
    random_state=25,
)

## Train Model

In [None]:
# Fit a logistic regression with default settings, then label the validation rows.
lrm = LogisticRegression().fit(X_train, y_train)
lrm_pred = lrm.predict(X_val)

## Model Performance

In [None]:
# Macro-averaged F1 on the validation split: the unweighted mean of the per-class F1 scores.
f1_score(y_true=y_val, y_pred=lrm_pred, average="macro")

## Test Data

In [None]:
# Vectorize the test messages with the already-fitted vectorizer (transform only,
# no re-fitting) so the columns match the matrix the model was trained on.
testx = test['message']
test_vec = vect.transform(testx)

In [None]:
# Predict a sentiment label for every vectorized test message.
y_pred = lrm.predict(test_vec)

In [None]:
# Store the predictions as a new 'sentiment' column on the test DataFrame.
test['sentiment'] = y_pred

## Submission

In [None]:
# Write only the tweetid and sentiment columns to the submission CSV, without the index.
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)