### Import Libraries

In [11]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split

### Train, Validation and Test Datasets - Sentiment

In [12]:
# Read the first sentiment dataset (five-level `Sentiment` labels) and preview it.
df1 = pd.read_csv(filepath_or_buffer='../Datasets/TwitterSentiment.csv')
df1.head(n=5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [13]:
# Read the second sentiment dataset (three-level `airline_sentiment` labels) and preview it.
df2 = pd.read_csv(filepath_or_buffer='../Datasets/AirlineSentiment.csv')
df2.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [14]:
# Remove the airline account handles listed below from the tweets of the
# second dataset. Matching is case-insensitive and each handle is matched
# literally (it is regex-escaped before being joined into one alternation).

remove_usernames = ["@VirginAmerica", "@united", "@SouthwestAir", "@JetBlue", "@USAirways", "@AmericanAir"]
remove_pattern = rf"({'|'.join(re.escape(name) for name in remove_usernames)})"
pattern = re.compile(remove_pattern, re.IGNORECASE)

# Use the vectorised string accessor instead of `.apply(lambda ...)`: it gives
# the same result for string values but propagates missing tweets (NaN) rather
# than raising `TypeError` from `pattern.sub`. `regex=True` is required when
# passing a compiled pattern; the IGNORECASE flag travels with the pattern.
df2['text'] = df2['text'].str.replace(pattern, '', regex=True)

In [15]:
# Label vocabularies used to reduce everything to three sentiment classes.

# Five-level label -> three-level label ("Extremely X" folds into "X").
sentiment_map = dict(
    [
        ("Extremely Negative", "negative"),
        ("Negative", "negative"),
        ("Neutral", "neutral"),
        ("Positive", "positive"),
        ("Extremely Positive", "positive"),
    ]
)

# Three-level label -> integer class id.
target_map = dict(negative=0, positive=1, neutral=2)

In [16]:
# Collapse the five-level labels of the first dataset into three classes.
# `replace` leaves values that are not keys of `sentiment_map` untouched, so
# re-running this cell is a no-op. The previous `.map(sentiment_map)` turned
# the already-collapsed labels into NaN on a second run, which in turn made
# every `Target` in df1 NaN.
df1["Sentiment"] = df1["Sentiment"].replace(sentiment_map)

# Encode the three classes as integer ids; any label that is not a key of
# `target_map` becomes NaN here.
df1["Target"] = df1["Sentiment"].map(target_map)
df2["Target"] = df2["airline_sentiment"].map(target_map)

# Fail loudly instead of silently exporting rows without a label.
assert df1["Target"].notna().all(), "df1 has Sentiment values missing from sentiment_map/target_map"
assert df2["Target"].notna().all(), "df2 has airline_sentiment values missing from target_map"

In [17]:
# Reduce both datasets to a shared two-column schema: the tweet text as
# `sentence` and the integer class id as `label`. `rename` returns a new frame,
# so the source frames are left unchanged.
df11 = df1[['OriginalTweet', 'Target']].rename(
    columns={'OriginalTweet': 'sentence', 'Target': 'label'}
)

df22 = df2[['text', 'Target']].rename(
    columns={'text': 'sentence', 'Target': 'label'}
)

In [18]:
# Stack the two harmonised frames, shuffle all rows with a fixed seed, and
# renumber the index 0..n-1 so it no longer reflects the pre-shuffle order.
df = (
    pd.concat([df11, df22], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,sentence,label
0,thanks for prompt response. Another hour to ...,1
1,Shopping for some people in my neighborhood th...,1
2,covid-19: you're gonna have to work from home\...,0
3,The price of a gallon of unleaded gas in some ...,2
4,If the government sends me money...not only wi...,0


In [19]:
# Step 1: hold out 1% of all rows as the test set.
temp, df_test = train_test_split(
    df,
    test_size=0.01,
    random_state=24,
)

# Step 2: split the remaining 99% into train and validation. A fraction of
# 0.3/0.99 of the remainder corresponds to 30% of the full dataset.
df_train, df_val = train_test_split(
    temp,
    test_size=0.3/0.99,
    random_state=24,
)

In [20]:
# Write each split to its own CSV file, without the DataFrame index column.
for split_frame, split_path in (
    (df_train, "../Datasets/TrainSentiment.csv"),
    (df_val, "../Datasets/ValidationSentiment.csv"),
    (df_test, "../Datasets/TestSentiment.csv"),
):
    split_frame.to_csv(split_path, index=False)