# Natural Language Processing

In this section, we use unsupervised learning, utilizing the gensim package, to find topics in our Twitter crypto dataset.

In [None]:
#importing necessary packages
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser

In [2]:
#loading the two raw tweet exports, one dataframe per csv file
data_1, data_2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/crypto_tweets_pts.csv', 'data/crypto_tweets.csv')
)

In [3]:
#combining and reconfiguring twitter datasets
#built as one chain (no inplace mutation) so the cell can be re-run safely:
#  - ignore_index=True renumbers the stacked rows, replacing the old
#    reset_index() + drop(columns=['index']) round-trip
#  - 'Unnamed: 0' is the row-number column left over from the csv export
#  - Datetime is parsed and then becomes the index
df = (
    pd.concat([data_1, data_2], axis=0, ignore_index=True)
    .drop(columns=['Unnamed: 0'])
    .assign(Datetime=lambda frame: pd.to_datetime(frame['Datetime']))
    .set_index('Datetime')
)

In [4]:
#english stopword list from the nltk corpus, used to filter tokens
stop_words = stopwords.words("english")

#cleans tweets
def remove(text):
    """Normalize a raw tweet before tokenization.

    Lowercases the text, strips @mentions, #hashtags and URLs, and collapses
    every run of whitespace into a single space.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased tweet without leading or trailing whitespace.
    """
    text = text.lower()
    # Matching happens after lowercasing, so the [a-z0-9_] classes cover
    # handles and tags regardless of their original case.
    text = re.sub(r"@[a-z0-9_]+|#[a-z0-9_]+|http\S+", "", text)
    # Whitespace is replaced by a space instead of being deleted, so words
    # separated only by a line break or tab are not fused into one token.
    # For str patterns `\s` also matches the non-breaking space (\xa0), which
    # the earlier literal "/xa0" replacement never matched.
    return re.sub(r"\s+", " ", text).strip()

#tokenizes tweets
def tokenizing(text):
    """Split text into tokens, keeping only alphabetic non-stopwords.

    Parameters
    ----------
    text : str
        Text to tokenize with ``word_tokenize``.

    Returns
    -------
    list of str
        Tokens for which ``str.isalpha()`` is true and that are not in the
        module-level ``stop_words`` collection, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        if token.isalpha() and token not in stop_words:
            kept.append(token)
    return kept

#lemmatizes tweets
def lemma(text, min_length):
    """Lemmatize a sequence of tokens and join the result into one string.

    Each token is tagged with ``pos_tag``; the first letter of the tag
    (J, V, N or R) selects the WordNet part of speech passed to
    ``WordNetLemmatizer.lemmatize``. Tokens whose tag starts with any other
    letter are dropped from the output.

    Parameters
    ----------
    text : sequence of str
        Tokens to tag and lemmatize.
    min_length : int
        Inputs with this many tagged tokens or fewer produce ``''``.

    Returns
    -------
    str
        Space-separated lemmas, or ``''`` when the input is too short.
    """
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    tagged = [(word, pos_by_initial.get(tag[:1])) for word, tag in pos_tag(text)]

    # The length gate counts every tagged token, including those that are
    # dropped below because their tag has no WordNet equivalent.
    if len(tagged) <= min_length:
        return ''

    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(word, pos) for word, pos in tagged if pos is not None
    )

In [5]:
#tokenizing tweets: clean each tweet first, then split it into filtered tokens
df['tokens'] = df['Text'].map(remove).map(tokenizing)

## Preprocessing

We use Phrases from gensim to detect bigrams and trigrams in the tokenized tweets. Then we lemmatize the resulting bigram and trigram token sequences.

In [6]:
#bigram and trigram models
#the trained Phrases objects get their own names because the names `bigram`
#and `trigram` are bound to filtered dataframes further down the notebook
bigram_phrases = Phrases(df.tokens, min_count=5, threshold=100)
#the trigram pass is trained on sentences whose bigrams are already merged;
#min_count=5 is the gensim default, spelled out to mirror the bigram pass
trigram_phrases = Phrases(bigram_phrases[df.tokens], min_count=5, threshold=100)

#frozen versions of the trained models, used when transforming tweets
bigram_mod = Phraser(bigram_phrases)
trigram_mod = Phraser(trigram_phrases)

In [7]:
#lemmatizing bigram and trigrams to create columns
#tweets with MIN_TOKENS tokens or fewer get an empty string from lemma()
MIN_TOKENS = 10

df['lemma_words_bigram'] = df.tokens.apply(lambda x: lemma(bigram_mod[x], MIN_TOKENS))
#trigram_mod was trained on bigram-merged sentences, so the bigram model has to
#be applied first; feeding it raw tokens can never produce a three-word phrase
df['lemma_words_trigram'] = df.tokens.apply(
    lambda x: lemma(trigram_mod[bigram_mod[x]], MIN_TOKENS)
)

In [8]:
#separating dataframes by lemmatized bigrams: keep rows with a non-empty lemma string
has_bigram_lemmas = df['lemma_words_bigram'].ne('')
bigram = df.loc[has_bigram_lemmas]
bigram

Unnamed: 0_level_0,Text,Username,Reply_Count,Amt_of_Likes,Amt_of_Retweets,Amt_of_Quotes,tokens,lemma_words_bigram,lemma_words_trigram
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-12-18 23:31:21+00:00,Come listen in with @ThorWallet on the next #M...,SushiSwap,47,75,18,1,"[come, listen, next, ama, talk, upcoming, defi...",come listen next ama talk upcoming defi wallet...,come listen next ama talk upcoming defi wallet...
2021-12-13 08:25:29+00:00,Samurais: https://t.co/QhvR0Cyllk\nArca: https...,SushiSwap,33,96,13,2,"[samurais, arca, frog, nation, look, forward, ...",samurai arca frog nation look forward see prop...,samurai arca frog nation look forward see prop...
2021-12-13 08:25:29+00:00,"long live Sushi \n\nover the past one month, w...",SushiSwap,56,400,59,5,"[long, live, sushi, past, one, month, received...",long live sushi month receive overwhelming sup...,long live sushi month receive overwhelming sup...
2021-12-12 00:58:17+00:00,$PSP launching a 3 months liquidity mining pro...,SushiSwap,31,155,29,0,"[psp, launching, months, liquidity, mining, pr...",psp launch month liquidity mining program sush...,psp launch month liquidity mining program sush...
2021-12-05 22:09:21+00:00,"Sushi community, \n\nThis is our official stat...",SushiSwap,86,284,46,10,"[sushi, community, official, statement, respon...",sushi community official statement response re...,sushi community official statement response re...
...,...,...,...,...,...,...,...,...,...
2022-01-03 09:34:06+00:00,@t_b_b_s Please enter those as a trade from on...,Coin_Tracking,0,1,0,0,"[please, enter, trade, one, coin, trade, group...",please enter trade coin trade group swap carry...,please enter trade coin trade group swap carry...
2022-01-02 09:10:33+00:00,@0xScissus Some people actually want to see th...,Coin_Tracking,1,0,0,0,"[people, actually, want, see, coins, account, ...",people actually want see coin account work avo...,people actually want see coin account work avo...
2022-01-01 15:19:05+00:00,@IHeartCrypto Please enter it as Spend (as all...,Coin_Tracking,1,1,0,0,"[please, enter, spend, purchases, goods, servi...",please enter spend purchase goods_services cry...,please enter spend purchase goods_services cry...
2022-01-01 10:35:50+00:00,@hodl_avax Could you please open a support tic...,Coin_Tracking,0,0,0,0,"[could, please, open, support, ticket, example...",please open support ticket example help fix al...,please open support_ticket example help fix al...


In [9]:
#separating dataframe by lemmatized trigrams: keep rows with a non-empty lemma string
has_trigram_lemmas = df['lemma_words_trigram'].ne('')
trigram = df.loc[has_trigram_lemmas]
trigram

Unnamed: 0_level_0,Text,Username,Reply_Count,Amt_of_Likes,Amt_of_Retweets,Amt_of_Quotes,tokens,lemma_words_bigram,lemma_words_trigram
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-12-18 23:31:21+00:00,Come listen in with @ThorWallet on the next #M...,SushiSwap,47,75,18,1,"[come, listen, next, ama, talk, upcoming, defi...",come listen next ama talk upcoming defi wallet...,come listen next ama talk upcoming defi wallet...
2021-12-13 08:25:29+00:00,Samurais: https://t.co/QhvR0Cyllk\nArca: https...,SushiSwap,33,96,13,2,"[samurais, arca, frog, nation, look, forward, ...",samurai arca frog nation look forward see prop...,samurai arca frog nation look forward see prop...
2021-12-13 08:25:29+00:00,"long live Sushi \n\nover the past one month, w...",SushiSwap,56,400,59,5,"[long, live, sushi, past, one, month, received...",long live sushi month receive overwhelming sup...,long live sushi month receive overwhelming sup...
2021-12-12 00:58:17+00:00,$PSP launching a 3 months liquidity mining pro...,SushiSwap,31,155,29,0,"[psp, launching, months, liquidity, mining, pr...",psp launch month liquidity mining program sush...,psp launch month liquidity mining program sush...
2021-12-05 22:09:21+00:00,"Sushi community, \n\nThis is our official stat...",SushiSwap,86,284,46,10,"[sushi, community, official, statement, respon...",sushi community official statement response re...,sushi community official statement response re...
...,...,...,...,...,...,...,...,...,...
2022-01-03 09:34:06+00:00,@t_b_b_s Please enter those as a trade from on...,Coin_Tracking,0,1,0,0,"[please, enter, trade, one, coin, trade, group...",please enter trade coin trade group swap carry...,please enter trade coin trade group swap carry...
2022-01-02 09:10:33+00:00,@0xScissus Some people actually want to see th...,Coin_Tracking,1,0,0,0,"[people, actually, want, see, coins, account, ...",people actually want see coin account work avo...,people actually want see coin account work avo...
2022-01-01 15:19:05+00:00,@IHeartCrypto Please enter it as Spend (as all...,Coin_Tracking,1,1,0,0,"[please, enter, spend, purchases, goods, servi...",please enter spend purchase goods_services cry...,please enter spend purchase goods_services cry...
2022-01-01 10:35:50+00:00,@hodl_avax Could you please open a support tic...,Coin_Tracking,0,0,0,0,"[could, please, open, support, ticket, example...",please open support ticket example help fix al...,please open support_ticket example help fix al...


In [10]:
#saving dataframes: trigram frame first, then bigram frame
for _frame, _csv_path in ((trigram, 'data/trigram.csv'), (bigram, 'data/bigram.csv')):
    _frame.to_csv(_csv_path)