# Tweets processing with [TextBlob](https://textblob.readthedocs.io/en/dev/)

### <span style="color:#ff5f27;"> 📝 Imports</span>

In [1]:
import io
import json
import math
import os.path
import re
import time
import warnings
from datetime import timedelta, datetime

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil import parser
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
from tqdm import tnrange, tqdm_notebook, tqdm

warnings.filterwarnings('ignore')

In [3]:
# Load the pre-processed tweets; the first CSV column is used as the row index.
df_tweets_processed = pd.read_csv(
    "tweets_processed.csv",
    index_col=0,
)

In [4]:
# Keep only the tweet text and expose it as 'tweets'. Renaming first (instead of
# selecting 'text' and then overwriting .columns) makes this cell safe to re-run:
# once the column is already called 'tweets' the rename is a no-op.
df_tweets_processed = df_tweets_processed.rename(columns={'text': 'tweets'})[['tweets']]
df_tweets_processed.head()

Unnamed: 0_level_0,tweets
date,Unnamed: 1_level_1
2021-02-05 10:52:04,📖 Weekend Read 📖\n\nKeen to learn about crypt...
2021-02-05 10:52:04,2⃣ Debunking 9 Bitcoin Myths by ⬇️ \n\ncryp...
2021-02-05 10:52:06,4⃣ 🎙️ Bloomberg LP CryptoOutlook 2021 with ⬇️...
2021-02-05 10:52:07,"5⃣ Blockchain 50 2021 by , , ⬇️\n\ncryptocur..."
2021-02-05 10:52:26,reddcoin rdd to the moon altcoin turnreddcoini...


## <span style='color:#ff5f27'>🧹 Additional text cleaning </span>

In [11]:
def timestamp_2_time(x):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp into Unix epoch milliseconds.

    Parameters
    ----------
    x : str or object
        Value whose ``str()`` form matches the format '%Y-%m-%d %H:%M:%S'.

    Returns
    -------
    int
        Milliseconds since the Unix epoch. The parsed datetime is naive, so
        ``datetime.timestamp()`` interprets it in the machine's local time zone.

    Raises
    ------
    ValueError
        If ``str(x)`` does not match the expected format.
    """
    # `datetime` is the class brought in by `from datetime import datetime`;
    # the former `datetime.datetime.strptime` raised AttributeError on it.
    dt_obj = datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    return int(dt_obj.timestamp() * 1000)

In [12]:
def clean_text2(df, text_col=None):
    """Second cleaning pass using the 'nltk' module.

    For every tweet: strips URLs, hashtags, mentions and all characters outside
    the English alphabet, tokenizes the rest, drops English stop words and
    lemmatizes the remaining tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tweet text. It is modified in place: a
        'cleaned_tweets' column is added (or overwritten).
    text_col : str, optional
        Name of the column to clean. When omitted, 'text' is used if present,
        otherwise 'tweets' (the name this notebook gives the column after
        loading the CSV).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'cleaned_tweets' column.

    Raises
    ------
    KeyError
        If the text column cannot be found in ``df``.

    Notes
    -----
    Requires the NLTK 'stopwords' and 'wordnet' corpora to be installed.
    """
    if text_col is None:
        text_col = next((c for c in ('text', 'tweets') if c in df.columns), None)
    if text_col is None or text_col not in df.columns:
        raise KeyError(
            "clean_text2 could not find a text column "
            "(looked for 'text' / 'tweets' or the given text_col)"
        )

    # A set gives constant-time membership checks for every token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lem = WordNetLemmatizer()
    # Built once and reused for all tweets instead of once per tweet.
    tokenizer = TweetTokenizer()

    def cleaning(data):
        # Missing or non-string cells (e.g. NaN) have nothing to clean.
        if not isinstance(data, str):
            return ""

        # remove urls
        text = re.sub(r'http\S+', ' ', data)

        # remove hashtags
        text = re.sub(r'#\w+', ' ', text)

        # remove mentions, then every run of characters outside the English alphabet
        text = re.sub(r'@\w+', ' ', text)
        text = re.sub('[^A-Za-z]+', ' ', text)

        # tokenize
        tokens = tokenizer.tokenize(text)

        # Drop punctuation tokens and stop words. The stop-word list is lowercase,
        # so compare on the lowercased token; the token itself keeps its casing.
        kept = [t for t in tokens if t.isalpha() and t.lower() not in stop_words]

        # lemmatize and join back into a single string
        return " ".join(lem.lemmatize(t) for t in kept)

    df['cleaned_tweets'] = df[text_col].apply(cleaning)

    return df

## <span style='color:#ff5f27'>🤖 Applying TextBlob </span>

In [13]:
def textblob_processing(df):
    """
    Applies TextBlob sentiment analysis to the 'cleaned_tweets' feature of the
    DataFrame df and sums the scores per calendar day.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets with their timestamps either in a 'date' column or in an index
        named 'date'. The frame is copied first, so the caller's object is not
        modified. The text column is whatever `clean_text2` expects.

    Returns
    -------
    pandas.DataFrame
        One row per day with the columns 'date' (string formatted as
        '%Y-%m-%d %H:%M:%S'), 'subjectivity' and 'polarity' (daily sums of the
        per-tweet TextBlob scores) and 'unix' (the value `timestamp_2_time`
        returns for 'date').

    Raises
    ------
    KeyError
        If df has neither a 'date' column nor an index named 'date'.
    """
    # Work on a copy so the caller's frame stays untouched and the cell can be re-run.
    df = df.copy()

    # The notebook loads the CSV with the dates as the index; promote them to a
    # regular column, which the aggregation below relies on.
    if 'date' not in df.columns:
        if df.index.name != 'date':
            raise KeyError(
                "textblob_processing needs a 'date' column or an index named 'date'"
            )
        df = df.reset_index()

    df = clean_text2(df)
    cleaned = df['cleaned_tweets'].astype(str)

    # One TextBlob pass per tweet yields both scores.
    sentiments = [TextBlob(tweet).sentiment for tweet in cleaned]

    # Only the two numeric score columns are aggregated, so the daily sum never
    # has to deal with the text columns.
    scores = pd.DataFrame(
        {
            'subjectivity': [s.subjectivity for s in sentiments],
            'polarity': [s.polarity for s in sentiments],
        },
        index=pd.DatetimeIndex(pd.to_datetime(df['date']), name='date'),
        dtype=float,
    )
    daily = scores.resample('1D').sum().reset_index()

    daily['date'] = daily['date'].dt.strftime('%Y-%m-%d %H:%M:%S')
    daily['unix'] = daily['date'].apply(timestamp_2_time)

    return daily

In [14]:
# Run the cleaning + TextBlob scoring pipeline over the loaded tweets.
tweets_textblob = textblob_processing(df=df_tweets_processed)

## <span style='color:#ff5f27'> 📥 Save the results</span>

In [15]:
# Persist the sentiment results; the DataFrame index is written as the first CSV column.
tweets_textblob.to_csv(path_or_buf="tweets_textblob.csv")