# Importing Libraries

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np 

import matplotlib.pyplot as plt
%matplotlib inline

import re 
import nltk
from nltk.corpus import stopwords
import string

# Loading the Dataset

In [2]:
# Read the CSV export and discard the columns at positions 1, 2 and 4-11.
df = pd.read_csv("NASA.csv")
df = df.drop(columns=df.columns[[1, 2, 4, 5, 6, 7, 8, 9, 10, 11]])
# The unnamed first column of the CSV is relabelled as the tweet identifier.
df = df.rename(columns={'Unnamed: 0': 'Tweet ID'})
df

Unnamed: 0,Tweet ID,Tweet
0,0,"Here's to you, Oppy. 🥂\n\nBefore you say #Good..."
1,1,Are there rivers and lakes on other worlds? Yo...
2,2,We want to hear from you!\n\nJoin our series o...
3,3,"The @NASAExoplanets data hint that WASP-39 b, ..."
4,4,.@NASAWebb just scored another first: a full p...
...,...,...
14105,14105,The supermoon is here! Be sure to bundle up th...
14106,14106,Ever wonder how we track supermoons 🌕 and othe...
14107,14107,"A supermoon is coming! Tonight, the full Moon ..."
14108,14108,Happy New Year from space! Astronauts aboard t...


In [3]:
# Store each tweet's character count in a 'len' column next to the text.
df = df.assign(len=df['Tweet'].str.len())
df

Unnamed: 0,Tweet ID,Tweet,len
0,0,"Here's to you, Oppy. 🥂\n\nBefore you say #Good...",245
1,1,Are there rivers and lakes on other worlds? Yo...,293
2,2,We want to hear from you!\n\nJoin our series o...,302
3,3,"The @NASAExoplanets data hint that WASP-39 b, ...",175
4,4,.@NASAWebb just scored another first: a full p...,189
...,...,...,...
14105,14105,The supermoon is here! Be sure to bundle up th...,174
14106,14106,Ever wonder how we track supermoons 🌕 and othe...,140
14107,14107,"A supermoon is coming! Tonight, the full Moon ...",255
14108,14108,Happy New Year from space! Astronauts aboard t...,254


In [4]:
# Summary statistics (count, mean, spread, quartiles) of the tweet lengths.
df.loc[:, 'len'].describe()

count    14110.000000
mean       239.409001
std         75.212052
min          8.000000
25%        208.000000
50%        268.000000
75%        297.000000
max        453.000000
Name: len, dtype: float64

In [5]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14110 entries, 0 to 14109
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Tweet ID  14110 non-null  int64 
 1   Tweet     14110 non-null  object
 2   len       14110 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 330.8+ KB


# Conversion of Emojis to Text

In [6]:
import regex
import emoji
import html.parser as html

# Text emoticons that split_count() recognises when they appear as a
# space-delimited token. A few entries are repeated; that does not affect
# the membership test they are used for.
#
# The backslash in ':\\*' is written doubled: a bare '\*' is not a valid
# escape sequence and Python warns about it. The runtime value of the entry
# (colon, backslash, asterisk) is unchanged.
emoticons = [':-)', ':)', '(:', '(-:', ':))', '((:', ':-D', ':D', 'X-D', 'XD', 'xD', 'xD', '<3', '</3', ':\\*',
             ';-)',
             ';)', ';-D', ';D', '(;', '(-;', ':-(', ':(', '(:', '(-:', ':,(', ':\'(', ':"(', ':((', ':D', '=D',
             '=)',
             '(=', '=(', ')=', '=-O', 'O-=', ':o', 'o:', 'O:', 'O:', ':-o', 'o-:', ':P', ':p', ':S', ':s', ':@',
             ':>',
             ':<', '^_^', '^.^', '>.>', 'T_T', 'T-T', '-.-', '*.*', '~.~', ':*', ':-*', 'xP', 'XP', 'XP', 'Xp',
             ':-|',
             ':->', ':-<', '$_$', '8-)', ':-P', ':-p', '=P', '=p', ':*)', '*-*', 'B-)', 'O.o', 'X-(', ')-X']


def split_count(text):
    """Return every emoji and text emoticon found in ``text``.

    Parameters
    ----------
    text : str
        Tweet text. HTML character references are decoded before scanning.

    Returns
    -------
    list of str
        Emoji grapheme clusters in order of appearance, followed by every
        space-delimited token that is listed in the module-level
        ``emoticons`` collection.
    """
    # Import the documented ``html.unescape`` directly. The notebook binds the
    # name ``html`` to ``html.parser``, where ``unescape`` is only reachable
    # because that module happens to import it.
    from html import unescape

    text = unescape(text)

    # NOTE(review): newer releases of the ``emoji`` package may no longer
    # expose ``UNICODE_EMOJI`` -- confirm against the installed version.
    known_emoji = emoji.UNICODE_EMOJI['en']  # looked up once, not per code point

    # ``\X`` yields whole grapheme clusters, so a multi-code-point emoji is
    # collected as a single item rather than as its separate parts.
    found = [
        cluster
        for cluster in regex.findall(r'\X', text)
        if any(char in known_emoji for char in cluster)
    ]

    # Emoticons are only detected as tokens separated by single spaces.
    found.extend(token for token in text.split(' ') if token in emoticons)
    return found

from collections import Counter

# Collect the emoji/emoticons of every tweet into one flat list.
# extend() grows the list in place; rebuilding it with
# ``emoji_list = emoji_list + ...`` copied the whole list once per tweet.
text = df['Tweet']
emoji_list = []
for t in text:
    emoji_list.extend(split_count(t))

# Frequency of each emoji/emoticon across the data set.
print(Counter(emoji_list))

Counter({'🚀': 907, '✨': 253, '👩\u200d🚀': 173, '🔴': 168, '🛰️': 167, '👨\u200d🚀': 158, '🔥': 151, '🌎': 146, '💫': 140, '🤩': 130, '📸': 105, '📺': 95, '☀️': 92, '✅': 90, '🌟': 90, '👀': 89, '🔭': 84, '🌕': 82, '🤔': 76, '😎': 68, '😍': 67, '🎧': 67, '🐉': 67, '🌌': 65, '☄️': 60, '🌊': 60, '🚨': 50, '✈️': 49, '🌍': 47, '➡️': 44, '🔬': 43, '👋': 41, '🚁': 41, '❓': 40, '🌑': 40, '✔️': 40, '🌔': 39, '🥳': 37, '🔊': 37, '😉': 36, '🤖': 36, '📅': 36, '🔗': 36, '🌖': 34, '🙌': 33, '🎶': 32, '▪️': 32, '🌙': 31, '🌞': 31, '🌒': 31, '⏰': 31, '🤓': 30, '🎙️': 29, '📲': 28, '❤️': 26, '🎉': 25, '🏆': 25, '🎥': 25, '☁️': 24, '📍': 24, '📚': 24, '🌝': 23, '🎙': 23, '🗓️': 23, '❄️': 23, '⭐': 23, '🧪': 22, '🔵': 22, '📷': 21, '🔍': 20, '🧑\u200d🚀': 20, '📱': 20, '💨': 19, '💥': 19, '🎨': 19, '👏': 19, '🌀': 19, '🌠': 19, '🔋': 19, '📡': 18, '💪': 18, '💧': 18, '🛰': 17, '🌱': 17, '🏈': 17, '⭐️': 17, '🪐': 16, '💻': 16, '🌏': 16, '⬇️': 16, '👉': 16, '🗳️': 14, '⚪️': 14, '🕒': 14, '🎃': 13, '🖖': 13, '👩🏽\u200d🚀': 13, '⚙️': 13, '💙': 12, '😱': 12, '👍': 12, '⚫': 12, '🌃': 12, '🌚': 12

In [7]:
import re
from emot.emo_unicode import UNICODE_EMOJI
# Cache of emoji -> replacement word, filled on first use by _emoji_replacements().
_EMOJI_REPLACEMENTS = {}


def _emoji_replacements():
    """Build (once) and return the emoji -> replacement-word mapping."""
    if not _EMOJI_REPLACEMENTS:
        for emot, label in UNICODE_EMOJI.items():
            # Drop commas and colons from the label and join its words with "_".
            cleaned = label.replace(",", "").replace(":", "")
            _EMOJI_REPLACEMENTS[emot] = "_".join(cleaned.split())
    return _EMOJI_REPLACEMENTS


def convert_emojis(text):
    """Replace each emoji in ``text`` with its underscore-joined label.

    Parameters
    ----------
    text : str
        Text that may contain emoji listed in ``UNICODE_EMOJI``.

    Returns
    -------
    str
        ``text`` with every known emoji substituted by its label. Labels are
        taken from ``UNICODE_EMOJI`` with commas and colons removed and
        whitespace collapsed to ``_``. Emoji are processed in the mapping's
        iteration order.
    """
    # The labels are normalized once and reused for every call instead of
    # being recomputed for each emoji on each tweet.
    for emot, replacement in _emoji_replacements().items():
        text = text.replace(emot, replacement)
    return text

# Substitute emoji with their text labels in every tweet.
df['Tweet'] = df['Tweet'].apply(convert_emojis)
df

Unnamed: 0,Tweet ID,Tweet,len
0,0,"Here's to you, Oppy. clinking_glasses\n\nBefor...",245
1,1,Are there rivers and lakes on other worlds? Yo...,293
2,2,We want to hear from you!\n\nJoin our series o...,302
3,3,"The @NASAExoplanets data hint that WASP-39 b, ...",175
4,4,.@NASAWebb just scored another first: a full p...,189
...,...,...,...
14105,14105,The supermoon is here! Be sure to bundle up th...,174
14106,14106,Ever wonder how we track supermoons full_moon ...,140
14107,14107,"A supermoon is coming! Tonight, the full Moon ...",255
14108,14108,Happy New Year from space! Astronauts aboard t...,254


# Preprocessing The Data

In [8]:
# Per column: does it contain at least one missing value?
df.isna().any()

Tweet ID    False
Tweet       False
len         False
dtype: bool

In [9]:
# Count tweets whose text repeats an earlier row.
# The earlier version wrote a whole-row duplicate flag into a temporary 'dup'
# column and deleted it straight away, so the check produced no result. The
# whole-row comparison also included 'Tweet ID', which looks unique per row
# (TODO confirm), so it could not flag repeated text. Comparing on 'Tweet'
# alone reports the repeats; no rows are removed and df is left unchanged.
df.duplicated(subset=['Tweet'], keep='first').sum()


In [10]:
def lower_text(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()
# Replace every tweet with its lowercase form.
df['Tweet'] = df['Tweet'].apply(lower_text)
df

Unnamed: 0,Tweet ID,Tweet,len
0,0,"here's to you, oppy. clinking_glasses\n\nbefor...",245
1,1,are there rivers and lakes on other worlds? yo...,293
2,2,we want to hear from you!\n\njoin our series o...,302
3,3,"the @nasaexoplanets data hint that wasp-39 b, ...",175
4,4,.@nasawebb just scored another first: a full p...,189
...,...,...,...
14105,14105,the supermoon is here! be sure to bundle up th...,174
14106,14106,ever wonder how we track supermoons full_moon ...,140
14107,14107,"a supermoon is coming! tonight, the full moon ...",255
14108,14108,happy new year from space! astronauts aboard t...,254


In [11]:
def remove_html_tags(text):
    """Strip every ``<...>`` span (shortest match, single line) from ``text``."""
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)
# Remove tag-like spans from every tweet.
df['Tweet'] = df['Tweet'].apply(remove_html_tags)
df

Unnamed: 0,Tweet ID,Tweet,len
0,0,"here's to you, oppy. clinking_glasses\n\nbefor...",245
1,1,are there rivers and lakes on other worlds? yo...,293
2,2,we want to hear from you!\n\njoin our series o...,302
3,3,"the @nasaexoplanets data hint that wasp-39 b, ...",175
4,4,.@nasawebb just scored another first: a full p...,189
...,...,...,...
14105,14105,the supermoon is here! be sure to bundle up th...,174
14106,14106,ever wonder how we track supermoons full_moon ...,140
14107,14107,"a supermoon is coming! tonight, the full moon ...",255
14108,14108,happy new year from space! astronauts aboard t...,254


In [12]:
def remove_tags(text):
    """Delete links: each ``http`` together with the non-whitespace run after it."""
    url_pattern = r"http\S+"
    return re.sub(url_pattern, "", text)
# Strip links from every tweet.
df['Tweet'] = df['Tweet'].apply(remove_tags)
df

Unnamed: 0,Tweet ID,Tweet,len
0,0,"here's to you, oppy. clinking_glasses\n\nbefor...",245
1,1,are there rivers and lakes on other worlds? yo...,293
2,2,we want to hear from you!\n\njoin our series o...,302
3,3,"the @nasaexoplanets data hint that wasp-39 b, ...",175
4,4,.@nasawebb just scored another first: a full p...,189
...,...,...,...
14105,14105,the supermoon is here! be sure to bundle up th...,174
14106,14106,ever wonder how we track supermoons full_moon ...,140
14107,14107,"a supermoon is coming! tonight, the full moon ...",255
14108,14108,happy new year from space! astronauts aboard t...,254


In [13]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``PUNCT_TO_REMOVE`` deleted."""
    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, PUNCT_TO_REMOVE))
    return text.translate(deletions)

# Delete ASCII punctuation from every tweet.
df["Tweet"] = df["Tweet"].apply(remove_punctuation)
df

Unnamed: 0,Tweet ID,Tweet,len
0,0,heres to you oppy clinkingglasses\n\nbefore yo...,245
1,1,are there rivers and lakes on other worlds you...,293
2,2,we want to hear from you\n\njoin our series of...,302
3,3,the nasaexoplanets data hint that wasp39 b aka...,175
4,4,nasawebb just scored another first a full prof...,189
...,...,...,...
14105,14105,the supermoon is here be sure to bundle up the...,174
14106,14106,ever wonder how we track supermoons fullmoon a...,140
14107,14107,a supermoon is coming tonight the full moon wi...,255
14108,14108,happy new year from space astronauts aboard th...,254


In [14]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated words found in ``STOPWORDS``.

    The input is converted with ``str()`` first, and the surviving words are
    joined with single spaces.
    """
    kept_words = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_words)

# Drop English stopwords from every tweet.
df["Tweet"] = df["Tweet"].apply(remove_stopwords)
df

Unnamed: 0,Tweet ID,Tweet,len
0,0,heres oppy clinkingglasses say goodnightoppy l...,245
1,1,rivers lakes worlds bet like earth saturn’s mo...,293
2,2,want hear join series virtual inperson meeting...,302
3,3,nasaexoplanets data hint wasp39 b aka bocaprin...,175
4,4,nasawebb scored another first full profile ato...,189
...,...,...,...
14105,14105,supermoon sure bundle lead “pack” outside view...,174
14106,14106,ever wonder track supermoons fullmoon lunar ev...,140
14107,14107,supermoon coming tonight full moon near closes...,255
14108,14108,happy new year space astronauts aboard spacest...,254
