In [1]:
import pandas as pd         
import matplotlib.pyplot as plt
from utilities import  create_bank_col, clean_tweet
#Reading data and models
import glob               
import os
import re
import numpy as np
import datetime as dt

## Load in Twitter dataset

In [2]:
# # Read the path
# dir_datasets = os.getenv("DATASETS_PATH")
# directory = f"{dir_datasets}/tweets_of_the_top_5_banks_in_SA/"

# # Use os.listdir() to get a list of all files in the directory
# files = os.listdir(directory)

# # Use a list comprehension to filter out only CSV files
# csv_files = [file for file in files if file.endswith(".csv")]

# # Initialize an empty list to store the dataframes
# df_list = []

# # Iterate over the CSV files and read them into pandas dataframes
# for file in csv_files:
#     file_path = os.path.join(directory, file)
#     df = pd.read_csv(file_path)
#     df_list.append(df)

# # Concatenate the dataframes
# tweets_df = pd.concat(df_list)

In [3]:
# load parquet file
tweets_df = pd.read_parquet("tweets_from_2019-01-01_to_2019-01-02.parquet")

## Exploratory Data Analysis

In [4]:
# Top records of dataset
tweets_df.head()

Unnamed: 0,Datetime,Tweet_Id,Tweet,Username,Reply_Count,Retweet_Count,Like_Count,Bank
0,2019-01-01 23:32:42+00:00,1080245458730184704,@SlowbucksAce Trippin ü§¶üèΩ‚Äç‚ôÇÔ∏è gotta give it to g...,fnb_justo,0,0,3,fnb
1,2019-01-01 22:36:31+00:00,1080231323552411650,"Fake news or nah, I needed to see that SMS fro...",BangDulamo_ZA,0,0,0,fnb
2,2019-01-01 22:29:20+00:00,1080229514712678400,Them ‚ìÇÔ∏è's coming in I let 'em stack up üí∞ Don't...,fnb_justo,0,1,11,fnb
3,2019-01-01 21:55:39+00:00,1080221037395169280,FNB is so annoying with their unauthorized deb...,babyLangah,0,0,0,fnb
4,2019-01-01 21:38:11+00:00,1080216641684819968,@hothaata Forget the critics let's start 2019 ...,NavasExpert,0,0,0,fnb


In [5]:
# Columns and features in data
tweets_df.columns

Index(['Datetime', 'Tweet_Id', 'Tweet', 'Username', 'Reply_Count',
       'Retweet_Count', 'Like_Count', 'Bank'],
      dtype='object')

In [6]:
# Length of dataset
print('Length of data is', len(tweets_df))

Length of data is 405


In [7]:
# Dataset information
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 405 entries, 0 to 39
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   Datetime       405 non-null    datetime64[ns, UTC]
 1   Tweet_Id       405 non-null    int64              
 2   Tweet          405 non-null    string             
 3   Username       405 non-null    string             
 4   Reply_Count    405 non-null    int64              
 5   Retweet_Count  405 non-null    int64              
 6   Like_Count     405 non-null    int64              
 7   Bank           405 non-null    string             
dtypes: datetime64[ns, UTC](1), int64(4), string(3)
memory usage: 28.5 KB


In [8]:
# Checking for Null values
print(f'Number of Null values: {np.sum(tweets_df.isnull().any(axis=1))}')

Number of Null values: 0


In [9]:
# cleaned_tweets, hashtags = clean_tweet(tweets_df['base_tweet'])
# # Add the cleaned tweets and hashtags columns to the dataframe
# tweets_df['clean_tweet'] = cleaned_tweets
# tweets_df['hashtags'] = hashtags

In [10]:
import re
import numpy as np
import emoji

def translate_emoji(tweet):
    if tweet == None or tweet == "":
      tweet = tweet
    else:
      tweet = emoji.demojize(tweet).replace(":", "").replace("_", " ")
    return tweet

def remove_email(tweet):
    email = re.compile(r'[\w\.-]+@[\w\.-]+')
    return email.sub(r'',tweet)
     
def clean_tweet(tweets):
    cleaned_tweets = []
    hashtags = []
    for tweet in tweets:
        
        # Remove mentions
        tweet = re.sub(r'@\w+', '', tweet)
        
        # Remove links
        tweet = re.sub(r'http\S+', '', tweet)
        tweet = re.sub(r'www\S+', '', tweet)
        tweet = re.sub(r'bit.ly/\S+', '', tweet) # remove bitly links
        tweet = tweet.strip('[link]') # remove [links]
        
        # Remove email address
        tweet = remove_email(tweet)
        
        # Translate emojis
        tweet = translate_emoji(tweet)
        
        # Capture hashtags
        hashtag_list = re.findall(r'#\w+', tweet)
        if len(hashtag_list) == 0:
            hashtag_list = []
            
        hashtags.append(hashtag_list[1:])
        
        # Remove hashtags
        tweet = re.sub(r"#(\w+)", '', tweet)
        
        # Remove &amp
        tweet = re.sub(r'&amp ', '', tweet)
        
        # Remove special characters
        tweet = re.sub('([_]+)', "", tweet)
        
        # remove any unnecessary spaces
        tweet = " ".join(tweet.split())
        
        tweet = "".join(i for i in tweet if ord(i)<128)
        
        cleaned_tweets.append(tweet)
    return cleaned_tweets, hashtags

cleaned_tweets, hashtags = clean_tweet(tweets_df['Tweet'])
# Add the cleaned tweets and hashtags columns to the dataframe
tweets_df['clean_tweet'] = cleaned_tweets
tweets_df['hashtags'] = hashtags

In [11]:
tweets_df[['Tweet', 'clean_tweet', 'hashtags']].head(50)

Unnamed: 0,Tweet,clean_tweet,hashtags
0,@SlowbucksAce Trippin ü§¶üèΩ‚Äç‚ôÇÔ∏è gotta give it to g...,Trippin man facepalming medium skin tone gotta...,[]
1,"Fake news or nah, I needed to see that SMS fro...","Fake news or nah, I needed to see that SMS fro...",[]
2,Them ‚ìÇÔ∏è's coming in I let 'em stack up üí∞ Don't...,Them circled M's coming in I let 'em stack up ...,[]
3,FNB is so annoying with their unauthorized deb...,FNB is so annoying with their unauthorized deb...,[]
4,@hothaata Forget the critics let's start 2019 ...,Forget the critics let's start 2019 on a high ...,[]
5,@MyChisha @Magzzy4 Lol the sequence is FNB mes...,"Lol the sequence is FNB messages ""zangena"" a L...",[]
6,I don't watch Tennis that much buh i just like...,I don't watch Tennis that much buh i just like...,[]
7,üòè Here‚Äôs my year in emoji: #HappyNewYear #MyEm...,smirking face Heres my year in emoji,[#MyEmojiYear]
8,"Lol, I check my FNB app everyday.","Lol, I check my FNB app everyday.",[]
9,üó£üó£üó£üó£Best all purpose player in the nation @H_H...,speaking headspeaking headspeaking headspeakin...,[]
