In [31]:
"""
    Importing necessary packages.
"""

import re
import preprocessor as p
import emoji
import regex
import string
import nltk
from nltk.tokenize.casual import TweetTokenizer
import pandas as pd


In [32]:
"""
    Opening csv file as a DataFrame
"""

# Load the tweets from a CSV located relative to the notebook's working directory.
# NOTE(review): later cells read the columns 'tweet_original', 'user',
# 'location_encoded' and 'preprocessed'; presumably they come from this file — confirm.
df = pd.read_csv('tweets.csv')

In [None]:
# Make sure the "tweet_original" column holds strings so the text helpers below can process every row.
# NOTE(review): astype(str) turns missing values into the literal text 'nan' — confirm that is acceptable.
df['tweet_original'] = df['tweet_original'].astype(str)

In [None]:
"""
    Returns a list of hashtags for each tweet that contains hashtags.
"""

def hashtags(row):
    """Return the hashtags found in a row's tweet text, without the leading '#'.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a 'tweet_original' string.

    Returns:
        list[str]: Hashtag bodies in order of appearance; empty when there are none.
    """
    # The capture group keeps only the word characters that follow '#'.
    return re.findall(r"#(\w+)", row['tweet_original'])

# Row-wise apply (axis=1): hashtags() receives each row and its returned list is stored in a new column.
df['hashtags'] = df.apply(hashtags, axis=1)

In [34]:
"""
    Encodes each user name by combining the first 3 characters of the user name with the first 3 characters of the user's encoded location.
"""

def encoding_username(row):
    """Build a short user code from the user name and the encoded location.

    Args:
        row: Mapping-like row with sliceable (string) values under 'user'
            and 'location_encoded'.

    Returns:
        str: The first three characters of 'user' followed by the first three
        characters of 'location_encoded'; shorter values contribute what they have.
    """
    user, location = row['user'], row['location_encoded']
    return user[:3] + location[:3]

# Row-wise apply (axis=1) storing the combined user/location code in a new column.
# NOTE(review): 'location_encoded' is not created in the visible cells; presumably it is already in the CSV — confirm.
df['user_encoded'] = df.apply(encoding_username, axis=1)

In [None]:
"""
    Returns a list of all emoji that each tweet contains.
"""

def all_emoji_tweet(row):
    """Return every emoji grapheme cluster contained in a row's tweet text.

    The text is split into extended grapheme clusters (regex ``\\X``) so that
    multi-code-point emoji stay in one piece. A cluster is kept when at least
    one of its characters is a key of the emoji package's lookup table.

    The lookup table is resolved in a version-tolerant way, because the
    attribute the emoji package exposes has changed across releases:
      * ``emoji.EMOJI_DATA`` is used when it exists (newer releases);
      * otherwise ``emoji.UNICODE_EMOJI`` is used, taking its ``'en'`` entry
        when the mapping is keyed by language code, or the mapping itself
        when it is a flat emoji -> name dictionary.
    Testing characters directly against a language-keyed ``UNICODE_EMOJI``
    would never match, and the attribute is absent where ``EMOJI_DATA`` exists.

    Args:
        row: Mapping-like row with a 'tweet_original' string.

    Returns:
        list[str]: Emoji clusters in order of appearance (duplicates kept);
        empty when the tweet has no emoji.
    """
    text = row['tweet_original']

    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        legacy_table = emoji.UNICODE_EMOJI
        # A flat legacy table has emoji characters as keys, so 'en' is absent
        # and the table itself is used.
        emoji_table = legacy_table.get('en', legacy_table)

    return [
        cluster
        for cluster in regex.findall(r'\X', text)
        if any(char in emoji_table for char in cluster)
    ]

# Row-wise apply (axis=1) storing each tweet's list of emoji in a new column.
df['all_emoji'] = df.apply(all_emoji_tweet, axis=1)

In [1]:
# Restrict the preprocessor module (p) to three tweet features — URLs, mentions and hashtags.
# This is module-wide configuration and must run before preprocess_tweet calls p.tokenize().
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)

"""
    For each tweet, tokenizes hashtags, urls, and mentions.
"""

def preprocess_tweet(row):
    """Collapse repeated spaces in a tweet and run it through p.tokenize.

    Args:
        row: Mapping-like row with a 'tweet_original' string.

    Returns:
        The value returned by p.tokenize for the space-normalised text.
        NOTE(review): presumably URLs, mentions and hashtags come back as
        placeholder tokens such as "$URL$", "$MENTION$" and "$HASHTAG$";
        confirm against the preprocessor documentation.
    """
    # Every run of spaces becomes a single space before tokenizing.
    single_spaced = re.sub(' +', ' ', row['tweet_original'])
    return p.tokenize(single_spaced)

In [None]:
"""
    Lowercasing for each tweet.
"""

def lowercase_tweet(row):
    """Return the row's 'preprocess_tweet' text converted to lower case.

    Args:
        row: Mapping-like row with a 'preprocess_tweet' string.

    Returns:
        str: The lower-cased text.
    """
    return row['preprocess_tweet'].lower()

In [None]:
# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)


def remove_punct_tweet(row):
    """Strip punctuation marks from the row's 'lowercase_tweet' text.

    Punctuation characters are deleted, not replaced by a space, so words
    joined only by punctuation end up concatenated.

    Args:
        row: Mapping-like row with a 'lowercase_tweet' string.

    Returns:
        str: The text with all string.punctuation characters removed.
    """
    return row['lowercase_tweet'].translate(table)

In [None]:
# Module-level TweetTokenizer instance reused by tokenize_tweet for every row;
# it was selected because it can tokenize emoji.
t = TweetTokenizer()

"""
    Tokenizes each tweet.
"""

def tokenize_tweet(row):
    """Split the row's 'remove_punct_tweet' text into tokens.

    Args:
        row: Mapping-like row with a 'remove_punct_tweet' string.

    Returns:
        The token sequence produced by the module-level tokenizer `t`.
    """
    return t.tokenize(row['remove_punct_tweet'])

In [None]:
"""
    Puts all the stages of preprocessing together.
"""

def preprocessed(row):
    """Return the final preprocessing result for a row.

    Args:
        row: Mapping-like row with a 'tokenize_tweet' value.

    Returns:
        The row's 'tokenize_tweet' value, unchanged.
    """
    return row['tokenize_tweet']

In [7]:
"""
    For each tweet, checks whether the tweet contains URLs.
"""

def url(row):
    """Report whether the row's preprocessed tweet contains "url".

    Args:
        row: Mapping-like row with a 'preprocessed' value.

    Returns:
        str: "yes" if "url" is in row['preprocessed'], otherwise "no".
    """
    # `in` is an exact-element test on a token list and a substring test on a plain string.
    return "yes" if "url" in row['preprocessed'] else "no"

# Row-wise apply (axis=1) storing a "yes"/"no" URL flag per tweet.
# NOTE(review): requires a 'preprocessed' column, which the visible cells never assign — confirm where it is created.
df['url_present'] = df.apply(url, axis=1)

In [5]:
"""
    For each tweet, checks whether the tweet contains mentions.
"""

def mention(row):
    """Report whether the row's preprocessed tweet contains "mention".

    Args:
        row: Mapping-like row with a 'preprocessed' value.

    Returns:
        str: "yes" if "mention" is in row['preprocessed'], otherwise "no".
    """
    # `in` is an exact-element test on a token list and a substring test on a plain string.
    return "yes" if "mention" in row['preprocessed'] else "no"

# Row-wise apply (axis=1) storing a "yes"/"no" mention flag per tweet.
# NOTE(review): requires a 'preprocessed' column, which the visible cells never assign — confirm where it is created.
df['mention_present'] = df.apply(mention, axis=1)

In [9]:
"""
    For each tweet, checks whether the tweet contains hashtags.
"""

def hashtag(row):
    """Report whether the row's preprocessed tweet contains "hashtag".

    Args:
        row: Mapping-like row with a 'preprocessed' value.

    Returns:
        str: "yes" if "hashtag" is in row['preprocessed'], otherwise "no".
    """
    # `in` is an exact-element test on a token list and a substring test on a plain string.
    return "yes" if "hashtag" in row['preprocessed'] else "no"

# Row-wise apply (axis=1) storing a "yes"/"no" hashtag flag per tweet.
# NOTE(review): requires a 'preprocessed' column, which the visible cells never assign — confirm where it is created.
df['hashtag_present'] = df.apply(hashtag, axis=1)