**Import necessary library**

In [88]:
import pandas as pd
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

**load dataset**

In [89]:
df=pd.read_csv("/content/Tweets.csv")

In [90]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [91]:
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [92]:
df.isnull().sum()

Unnamed: 0,0
tweet_id,0
airline_sentiment,0
airline_sentiment_confidence,0
negativereason,5462
negativereason_confidence,4118
airline,0
airline_sentiment_gold,14600
name,0
negativereason_gold,14608
retweet_count,0


In [93]:
df.shape

(14640, 15)

**Data Cleaning**

In [94]:
df=df[['text','airline_sentiment']]  # only takes this columns because it only used for it

In [95]:
df

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


In [96]:
#Text Preprocessing
def clean_text(text):
    text=text.lower()
    text=re.sub(r'http\S+', '', text)
    text=re.sub(r'@\w+', '', text)
    text=re.sub(r'#\w+', '', text)
    text=re.sub(r'[^\w\s]', '', text)
    text=re.sub(r'\d+', '', text)
    text=text.strip()
    return text

df['clean_text']=df['text'].apply(clean_text)
df[['text','clean_text']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text']=df['text'].apply(clean_text)


Unnamed: 0,text,clean_text
0,@VirginAmerica What @dhepburn said.,what said
1,@VirginAmerica plus you've added commercials t...,plus youve added commercials to the experience...
2,@VirginAmerica I didn't today... Must mean I n...,i didnt today must mean i need to take another...
3,@VirginAmerica it's really aggressive to blast...,its really aggressive to blast obnoxious enter...
4,@VirginAmerica and it's a really big bad thing...,and its a really big bad thing about it


**Feature Extraction with TF-Idf**

In [97]:
x=df['clean_text']
y=df['airline_sentiment']

In [98]:
# Convert text into numerical feature
vectorizer=TfidfVectorizer(max_features=5000)
x_vectorized=vectorizer.fit_transform(x)

**Train/Test Split**

In [99]:
X_train,X_test,y_train,y_test=train_test_split(x_vectorized,y,test_size=0.2,random_state=42,stratify=y)

**Train a Model**

In [103]:
model=LogisticRegression()
model.fit(X_train,y_train)

**Evaluate the Model**

In [104]:
y_pred=model.predict(X_test)

In [105]:
print("Accuracy:",accuracy_score(y_test,y_pred))
print("\nClassification Report:\n",classification_report(y_test, y_pred))

Accuracy: 0.79474043715847

Classification Report:
               precision    recall  f1-score   support

    negative       0.82      0.94      0.88      1835
     neutral       0.67      0.55      0.60       620
    positive       0.83      0.56      0.67       473

    accuracy                           0.79      2928
   macro avg       0.77      0.68      0.72      2928
weighted avg       0.79      0.79      0.78      2928

