## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in your data from kaggle.  
By working in a kaggle kernel, you can access the data directly from the competition, as well as make your submission without downloading your output file

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [3]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

## Splitting out the X variable from the target

In [47]:
y = train['sentiment']
X = train['message']

## Turning text into something your model can read

In [48]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [49]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.2,shuffle=True, stratify=y, random_state=42)

## Training the model and evaluating using the validation set 

In [50]:
lin = LinearSVC()
lin.fit(X_train, y_train)
lin_pred = lin.predict(X_val)


## Checking the performance of our model on the validation set

In [51]:
f1_score(y_val, lin_pred, average="macro")

0.6497779120451052

## Getting our test set ready 

In [35]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [36]:
y_pred = lin.predict(test_vect)
print(test_vect)

  (0, 11415)	0.35281177899113386
  (0, 7329)	0.4846906989238615
  (0, 7317)	0.26495153879095945
  (0, 7209)	0.35948649719733294
  (0, 5934)	0.08371863569704267
  (0, 4855)	0.31457176970353307
  (0, 4854)	0.2994253139274821
  (0, 4498)	0.37450758866907347
  (0, 2706)	0.06486513576518152
  (0, 2700)	0.06409702036141916
  (0, 2549)	0.26048924841119064
  (0, 2102)	0.15787657743712552
  (0, 1790)	0.06478949343904095
  (1, 13116)	0.5219073911799101
  (1, 11177)	0.34879963029736377
  (1, 11136)	0.5219073911799101
  (1, 9758)	0.3503674945226779
  (1, 5934)	0.09426433992117808
  (1, 4771)	0.43134243341112166
  (1, 2706)	0.07303593944040379
  (1, 2700)	0.07217106758820392
  (1, 1790)	0.07295076875069564
  (2, 13401)	0.4152801506487914
  (2, 10526)	0.37146601740149726
  (2, 6498)	0.444092914353651
  :	:
  (10543, 2700)	0.052006509639160474
  (10543, 1923)	0.315626771283914
  (10543, 1790)	0.05256836271100582
  (10544, 12648)	0.17995229726073914
  (10544, 9845)	0.10698877408362174
  (10544, 5893)	

In [37]:
test['sentiment'] = y_pred

In [38]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


## Creating an output csv for submission

In [39]:
test[['tweetid','sentiment']].to_csv('testsubmission6.csv', index=False)