# Part 2 - Preprocessing Tweets

## 1. Importing Libraries
### 1.1 Libraries

In [2]:
pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
#Initializing pandas & numpy
import pandas as pd
import numpy as np

# Text Pre-processing libraries
import nltk
import re
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
# Downloading from the collection
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


#Spark Libraries
import findspark
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Emoji library
import emoji

#Data visualization 
from matplotlib import pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Neu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Neu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Neu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ModuleNotFoundError: No module named 'pyspark'

In [6]:
# Spark Session
spark = SparkSession.builder.appName("Spark").getOrCreate()

### 1.2 Configurations

In [7]:
# Directory to store the tweets data after analysis & pre-processing
tweet_df_path = "../Data/tweets_data.csv"
processed_tweet_df_path = "../Data/processed_tweet_data.csv" 

## 2. Loading Tweets

In [8]:
# Loading the data
tweet_df = spark.read.format('csv').options(header='true').option("mode", "DROPMALFORMED").load(tweet_df_path)

## 3. Analysing Data

In [9]:
# previewing the df table
tweet_df.limit(5).toPandas()

Unnamed: 0,tweet,date,author,hashtags,followers_count,friends_count,coordinates,retweet_count,favorite_count
0,@SpyWolfNetwork $SPY #spywolf #bnb #bscgem #bi...,2022-02-22 12:06:03+00:00,Hamed58035768,"['spywolf', 'bnb', 'bscgem', 'binance', 'bitco...",108,49,,0,0
1,Investor's Kit for your daily investment needs...,2022-02-22 12:06:03+00:00,btcbitcoinnews,"['Bitcoin', 'ETH', 'ethereum', 'Ripple', 'cryp...",153,216,,0,0
2,"14/""",2022-02-22 12:06:02+00:00,eCashInformer,"['PoW', 'blockchain', 'Bitcoin']",2921,89,,0,0
3,Bitcoin could be laid low by miners' malady - ...,2022-02-22 12:06:02+00:00,JohnLothian,[],9857,3511,,0,0
4,Ray Dalio: Allocating up to 2% of your portfol...,2022-02-22 12:06:02+00:00,acoindetective,[],11500,2355,,0,0


In [10]:
#Verifying the schema of the data table
tweet_df.printSchema()

root
 |-- tweet: string (nullable = true)
 |-- date: string (nullable = true)
 |-- author: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- followers_count: string (nullable = true)
 |-- friends_count: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- retweet_count: string (nullable = true)
 |-- favorite_count: string (nullable = true)



In [11]:
#Converting spark df to pandas
tweet_df = tweet_df.toPandas()

#NaN values removal
tweet_df = tweet_df[tweet_df["tweet"].notna()]

In [12]:
# Adding index number 
tweet_df.index.names = ['Index_number']

In [13]:
#Viewing the data
tweet_df

Unnamed: 0_level_0,tweet,date,author,hashtags,followers_count,friends_count,coordinates,retweet_count,favorite_count
Index_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,@SpyWolfNetwork $SPY #spywolf #bnb #bscgem #bi...,2022-02-22 12:06:03+00:00,Hamed58035768,"['spywolf', 'bnb', 'bscgem', 'binance', 'bitco...",108,49,,0,0
1,Investor's Kit for your daily investment needs...,2022-02-22 12:06:03+00:00,btcbitcoinnews,"['Bitcoin', 'ETH', 'ethereum', 'Ripple', 'cryp...",153,216,,0,0
2,"14/""",2022-02-22 12:06:02+00:00,eCashInformer,"['PoW', 'blockchain', 'Bitcoin']",2921,89,,0,0
3,Bitcoin could be laid low by miners' malady - ...,2022-02-22 12:06:02+00:00,JohnLothian,[],9857,3511,,0,0
4,Ray Dalio: Allocating up to 2% of your portfol...,2022-02-22 12:06:02+00:00,acoindetective,[],11500,2355,,0,0
...,...,...,...,...,...,...,...,...,...
4658,"#LegitCommunity #Airdrop #MetaUni #Bitcoin""",2022-02-22 10:27:43+00:00,tutulbd7773,"['LegitCommunity', 'Airdrop', 'MetaUni', 'Bitc...",108,1059,,0,0
4659,"#bitcoin #Crypto #cryptocurrencies #CryptoNews""",2022-02-22 10:27:42+00:00,areeb0336,"['btc', 'bitcoin', 'Crypto', 'cryptocurrencies...",32,53,,0,0
4660,#Ethereum #ETH #BTC #Bitcoin #Metaverse #PlayT...,2022-02-22 10:27:39+00:00,m_almanasrah,"['Ethereum', 'ETH', 'BTC', 'Bitcoin', 'Metaver...",1997,163,,0,4
4661,#Cryptocurrency $BTC $etc https://t.co/uSsLjuH...,2022-02-22 10:27:39+00:00,AgeingTrex,"['Bitcoin', 'Ethereum', 'Crypto', 'Cryptocurre...",342,0,,0,0


## 4. Cleaning Tweets

In [11]:
#Initializing Lemm & Stem
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

#setting the english stopwords into the var stop_words + punctual
stop_words = set(stopwords.words('english')+ list(punctuation))

# Function to pre-process tweets
def process_tweets(text):
    """
    This function cleans the tweets by stripping off URLs, miscellaneous characters. 
    Then the tweets are tokenized, stemmed, and lemmatized. 
    
    Parameters:
    text: The tweet.
    Retrun: The processed tweet.
    """
   # Text cleaning
    text = text.lower()
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    text = re.sub('@[^\s]+', ' ', text)
    text = emoji.demojize(text)
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    text = [stemmer.stem(w) for w in text]
    text = [lemmatizer.lemmatize(w) for w in text]

    #joining every word in the text by a space
    return " ".join(text)


# Inside the tweet column, cleaned tweet(text)applied
tweet_df.tweet = tweet_df.tweet.apply(process_tweets)

#creating csv in the "processed_tweet_df_path" path 
tweet_df.to_csv(processed_tweet_df_path, index=False)