# Imports

In [1]:
import pandas as pd 
import numpy as np 
import nltk                                # a Python NLP library  
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator

# Load Data

In [2]:
# Load one CSV of tweets per topic into its own DataFrame.
data_food = pd.read_csv('food_usa.csv')
data_politics = pd.read_csv('politics_usa.csv')
data_magazine = pd.read_csv('magazine_usa.csv')
data_sport = pd.read_csv('sport_usa.csv')
data_technology = pd.read_csv('technology_usa.csv')

# Preview the first rows of the politics frame to check its columns.
data_politics.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,source
0,0,1515056253793447944,"Scott Pruitt, Trump's former EPA chief, is ru...",politico
1,1,1515040871418261506,RT @politicongress: A joint fundraising commit...,politico
2,2,1515017655182409731,"From their earliest days, Black churches have ...",politico
3,3,1515011952237588485,"RT @playbookdc: In early November, Rep. Chip R...",politico
4,4,1515002027079872528,Add West Wing Playbook to your daily reads for...,politico


In [3]:
data_fitness = pd.read_csv('fitnes-health_usa.csv')

## Looking at raw tweets

Before anything else, we can print a couple of tweets from our datasets to see how they look. 

In [4]:
# Pick the row by position, bounded by the frame's real length, instead of a
# hard-coded 0-4500 label range (which skips later rows and can raise KeyError).
print("POLITCS:", data_politics['text'].iloc[random.randrange(len(data_politics))])

POLITCS: Macron warns of "escalation of rhetoric" after Biden "genocide" comment https://t.co/in2JitJDxt https://t.co/kKWZumU8I2


In [5]:
# Pick the row by position, bounded by the frame's real length, instead of a
# hard-coded 0-4500 label range (which skips later rows and can raise KeyError).
print("FOOD:", data_food['text'].iloc[random.randrange(len(data_food))])

FOOD: “Burnt” seems poised to become one more tool brands can use to make it always seem like they’re coming up with something new https://t.co/n31jyb6fZn


In [6]:
# Pick the row by position, bounded by the frame's real length, instead of a
# hard-coded 0-4500 label range (which skips later rows and can raise KeyError).
print("MAGAZINE:", data_magazine['text'].iloc[random.randrange(len(data_magazine))])

MAGAZINE: BREAKING: Will Smith has responded to the Academy of Motion Pictures Arts and Sciences’ decision to ban him from all Oscars events for 10 years. https://t.co/Al35k9PR77


In [7]:
# Pick the row by position, bounded by the frame's real length, instead of a
# hard-coded 0-4500 label range (which skips later rows and can raise KeyError).
print("TECHNOLOGY:", data_technology['text'].iloc[random.randrange(len(data_technology))])

TECHNOLOGY: Apple turns monitor height adjustment into a $400 upsell https://t.co/kPoMJv5MOp https://t.co/m9YGAPvu4j


In [8]:
# Pick the row by position, bounded by the frame's real length, instead of a
# hard-coded 0-4500 label range (which skips later rows and can raise KeyError).
print("SPORT:", data_sport['text'].iloc[random.randrange(len(data_sport))])

SPORT: RT @FOXBetLive: World Series Picks ⚾️ 🏆

@spshoot: Dodgers (+500)
@BenVerlander: Blue Jays (+900)

Which bet are you taking? https://t.co/f…


# Preprocess raw text for Sentiment analysis

Data preprocessing is one of the critical steps in any machine learning project. It includes cleaning and formatting the data before feeding it into a machine learning algorithm. For NLP, preprocessing consists of the following tasks:

- Tokenizing the string
- Lowercasing
- Removing stop words and punctuation
- Stemming

In [9]:
tweet = data_magazine.loc[5].text 
tweet

'Boost your hair care routine! #Scream star #JennaOrtega revealed her go-to beauty regimen for her luscious locks. Shop her favorite products now!\nhttps://t.co/uwvRvZFI94'

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melih\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# Remove hyperlinks, Twitter marks and styles

In [12]:
# Substitutions applied in order; every match is replaced by an empty string.
cleanup_patterns = (
    r'^RT[\s]+',             # old style retweet text "RT" at the start of the tweet
    r'https?://[^\s\n\r]+',  # hyperlinks
    r'#',                    # only the hash sign; the hashtag word itself is kept
)

tweet2 = tweet
for pattern in cleanup_patterns:
    tweet2 = re.sub(pattern, '', tweet2)

tweet2

'Boost your hair care routine! Scream star JennaOrtega revealed her go-to beauty regimen for her luscious locks. Shop her favorite products now!\n'

In [13]:
tweet

'Boost your hair care routine! #Scream star #JennaOrtega revealed her go-to beauty regimen for her luscious locks. Shop her favorite products now!\nhttps://t.co/uwvRvZFI94'

# Tokenize the string
To tokenize means to split the strings into individual words without blanks or tabs. In this same step, we will also convert each word in the string to lower case.

In [14]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

In [15]:
tweet_tokens = tokenizer.tokenize(tweet2)

print(tweet2)
print(tweet_tokens)

Boost your hair care routine! Scream star JennaOrtega revealed her go-to beauty regimen for her luscious locks. Shop her favorite products now!

['boost', 'your', 'hair', 'care', 'routine', '!', 'scream', 'star', 'jennaortega', 'revealed', 'her', 'go-to', 'beauty', 'regimen', 'for', 'her', 'luscious', 'locks', '.', 'shop', 'her', 'favorite', 'products', 'now', '!']


### Remove Stop Words and Punctuation

The next step is to remove stop words and punctuation. Stop words are words that don't add significant meaning to the text. 
You'll see them below. 

In [16]:
stopwords_english = stopwords.words('english') 

print('Stop words\n')
print(stopwords_english)

print('\nPunctuation\n')
print(string.punctuation)

Stop words

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [17]:
# string.punctuation is a plain str, so `word in string.punctuation` would be a
# substring test; compare against a set of its characters for an exact match.
punctuation_marks = set(string.punctuation)
# A set also makes the stop-word lookup constant-time.
stopword_set = set(stopwords_english)

tweets_clean = [
    word
    for word in tweet_tokens
    if word not in stopword_set and word not in punctuation_marks
]

print('removed stop words and punctuation:')
print(tweets_clean)

removed stop words and punctuation:
['boost', 'hair', 'care', 'routine', 'scream', 'star', 'jennaortega', 'revealed', 'go-to', 'beauty', 'regimen', 'luscious', 'locks', 'shop', 'favorite', 'products']


### Stemming

Stemming is the process of converting a word to its most general form, or stem. This helps in reducing the size of our vocabulary.

Consider the words: 
 * **learn**
 * **learn**ing
 * **learn**ed
 * **learn**t

In [18]:
stemmer = PorterStemmer() 

In [19]:
# Reduce every cleaned token to its stem with the stemmer created above.
tweets_stem = [stemmer.stem(token) for token in tweets_clean]

print('stemmed words:')
print(tweets_stem)

stemmed words:
['boost', 'hair', 'care', 'routin', 'scream', 'star', 'jennaortega', 'reveal', 'go-to', 'beauti', 'regimen', 'lusciou', 'lock', 'shop', 'favorit', 'product']


# Create a Function That Preprocesses Tweets One by One

In [20]:
def _tweet_tools():
    """Return the shared (stemmer, tokenizer, stop-word set, punctuation set).

    The objects are built on the first call and cached on the function itself,
    so processing thousands of tweets does not reload the stop-word corpus or
    rebuild the tokenizer for every single tweet.
    """
    tools = getattr(_tweet_tools, "_cache", None)
    if tools is None:
        tools = (
            PorterStemmer(),
            TweetTokenizer(preserve_case=False, strip_handles=True,
                           reduce_len=True),
            # A set gives constant-time membership tests for stop words.
            frozenset(stopwords.words('english')),
            # Individual punctuation characters: membership is an exact match,
            # not a substring search inside the string.punctuation string.
            frozenset(string.punctuation),
        )
        _tweet_tools._cache = tools
    return tools


def process_tweet(tweet):
    """Process tweet function.

    Cleans the raw text, tokenizes it in lower case (user handles stripped,
    repeated characters shortened), drops English stop words and ASCII
    punctuation tokens, and stems what is left.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens from the processed tweet
    """
    stemmer, tokenizer, stopwords_english, punctuation = _tweet_tools()

    tweet = re.sub(r'\$\w*', '', tweet)  # '$'-prefixed tokens (tickers, amounts like $400)
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # leading old-style retweet marker "RT"
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)  # only the hash sign; the hashtag word is kept
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: only ASCII punctuation is filtered; typographic marks such as
    # '“', '”' or '…' are separate tokens and pass through unchanged.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in punctuation
    ]

    return tweets_clean

In [21]:
process_tweet(tweet)

['boost',
 'hair',
 'care',
 'routin',
 'scream',
 'star',
 'jennaortega',
 'reveal',
 'go-to',
 'beauti',
 'regimen',
 'lusciou',
 'lock',
 'shop',
 'favorit',
 'product']

In [22]:
process_tweet(data_food.loc[2].text)

['muffin', 'tin', 'great', 'meal', 'prep', 'individu', 'serv']

# Data Pre-Processing

In [23]:
data_food.head() 

Unnamed: 0.1,Unnamed: 0,tweet_id,text,source
0,0,1516326752834002944,A reminder that delivery app restaurants aren'...,foodandwine
1,1,1516318179894640641,"Braised chicken legs, soy sauce, vinegar, blac...",foodandwine
2,2,1516311648889417728,Muffin tins are great for meal prepping indivi...,foodandwine
3,3,1516296543791456260,We tested everything from spicy dill pickle fl...,foodandwine
4,4,1516290031522127876,Is eating a regular Big Mac the secret to a lo...,foodandwine


In [24]:
# Series.map keeps one token list per row and .to_numpy() always yields a 1-D
# object array; np.array(list_of_lists, dtype=object) would turn into a 2-D
# array whenever every token list happened to have the same length.
data_food_processed = data_food['text'].map(process_tweet).to_numpy()

In [25]:
data_food_processed.shape 

(4881,)

In [26]:
data_food_processed[0]

['remind', 'deliveri', 'app', 'restaur', 'alway', 'seem']

In [27]:
data_food_processed[2]

['muffin', 'tin', 'great', 'meal', 'prep', 'individu', 'serv']

In [28]:
data_food_processed[5]

['four', 'ingredi', 'one', 'bowl', 'five', 'minut', 'stir', 'togeth']

In [29]:
# Series.map keeps one token list per row and .to_numpy() always yields a 1-D
# object array; np.array(list_of_lists, dtype=object) would turn into a 2-D
# array whenever every token list happened to have the same length.
data_magazine_processed = data_magazine['text'].map(process_tweet).to_numpy()
# NOTE: the misspelled name is kept because the "Data Preparation" cell refers to it.
data_technolohy_processed = data_technology['text'].map(process_tweet).to_numpy()
data_politics_processed = data_politics['text'].map(process_tweet).to_numpy()
data_sport_processed = data_sport['text'].map(process_tweet).to_numpy()

In [30]:
# Series.map keeps one token list per row and .to_numpy() always yields a 1-D
# object array; np.array(list_of_lists, dtype=object) would turn into a 2-D
# array whenever every token list happened to have the same length.
data_fitness_processed = data_fitness['text'].map(process_tweet).to_numpy()

In [31]:
data_magazine_processed[2]

['internet',
 'troll',
 'field',
 'day',
 "jadensmith'",
 'expens',
 "willsmith'",
 'son',
 'butt',
 'joke',
 'say',
 'like',
 'talk',
 'topic']

In [32]:
data_politics_processed[3]

['earli',
 'novemb',
 'rep',
 'chip',
 'roy',
 'text',
 'mark',
 'meadow',
 'say',
 '“',
 'need',
 'ammo',
 '”',
 'former',
 'presid',
 'donald',
 "trump'",
 'effort',
 '…']

In [33]:
data_fitness_processed[2]

['time',
 'get',
 'rockin',
 'foam',
 'rollin',
 'last',
 'time',
 'show',
 'way',
 'use',
 'foam',
 'roller',
 'massag',
 'sore',
 'muscl',
 'use',
 'versatil',
 'tool',
 'stretch',
 'far',
 'beyond',
 'whether',
 'pre',
 'post-workout',
 'stretch',
 'time',
 'feel',
 'differ',
 'foam',
 'make']

# Data Preparation 

![alt text](data.png "Title")


In [42]:
# Map each topic name to its array of processed tweets so the topics can be
# iterated over uniformly. `data_technolohy_processed` is spelled the same way
# as in the cell that creates it.
data_iterative = {'fitness' : data_fitness_processed, 
                  'food' : data_food_processed, 
                  'magazine' : data_magazine_processed, 
                  'politics' : data_politics_processed, 
                  'sport' : data_sport_processed, 
                  'technology' : data_technolohy_processed}

In [43]:
len(data_iterative)

6

In [92]:
# Number of tweets taken from each *other* topic when building a topic's set.
REST_SAMPLES_PER_TOPIC = 1000

all_data = {}
for key, item in data_iterative.items():
    print("-----------------", key ,"-----------------------")
    print("actual:", key, len(item))

    # Start with every tweet of the current topic, then add the first
    # REST_SAMPLES_PER_TOPIC tweets of each remaining topic (in dict order).
    parts = [item]
    running_size = len(item)
    for key_in, item_in in data_iterative.items():
        if key_in == key:
            continue

        rest = item_in[:REST_SAMPLES_PER_TOPIC]
        parts.append(rest)
        running_size += len(rest)
        print("rest ", key_in, running_size)

    # Concatenate once per topic instead of re-copying the growing array on
    # every inner iteration. 'actual_size' records how many leading entries
    # belong to the topic itself.
    all_data[key] = {'set' : np.concatenate(parts), 'actual_size' : len(item)}
    

----------------- fitness -----------------------
actual: fitness 4665
rest  food 5665
rest  magazine 6665
rest  politics 7665
rest  sport 8665
rest  technology 9665
----------------- food -----------------------
actual: food 4881
rest  fitness 5881
rest  magazine 6881
rest  politics 7881
rest  sport 8881
rest  technology 9881
----------------- magazine -----------------------
actual: magazine 4954
rest  fitness 5954
rest  food 6954
rest  politics 7954
rest  sport 8954
rest  technology 9954
----------------- politics -----------------------
actual: politics 4968
rest  fitness 5968
rest  food 6968
rest  magazine 7968
rest  sport 8968
rest  technology 9968
----------------- sport -----------------------
actual: sport 4829
rest  fitness 5829
rest  food 6829
rest  magazine 7829
rest  politics 8829
rest  technology 9829
----------------- technology -----------------------
actual: technology 4976
rest  fitness 5976
rest  food 6976
rest  magazine 7976
rest  politics 8976
rest  sport 9976


In [94]:
all_data['fitness']

{'set': array([list(['know', 'effect', 'exercis', 'leg', 'tricki', 'design', 'quick', 'effect', '15', 'min', 'workout', 'knock', 'park', 'next', 'leg', 'day', 'one', 'els', 'back', 'shoulder', 'leg', 'like', 'us', '💯']),
        list(['stress', '😩', 'tire', '😴', 'worn', '😰', 'common', "they'r", 'perfect', 'reason', 'come', 'gym', 'work', 'us', 'promis', 'leav', 'feel', 'littl', 'relax', 'energ', 'readi', 'tackl', "what'", 'ahead', "that'", 'realaf']),
        list(['time', 'get', 'rockin', 'foam', 'rollin', 'last', 'time', 'show', 'way', 'use', 'foam', 'roller', 'massag', 'sore', 'muscl', 'use', 'versatil', 'tool', 'stretch', 'far', 'beyond', 'whether', 'pre', 'post-workout', 'stretch', 'time', 'feel', 'differ', 'foam', 'make']),
        ...,
        list(['meet', 'grow', 'founder', 'network', 'small', 'group', 'discuss', 'techcrunch', 'earli', 'stage']),
        list(['microsoft', 'want', 'build', 'next', 'game', 'cloud']),
        list(['onepointon', 'want', 'piec', 'vertic', 'farm',

In [95]:
# Write each topic's merged set to its own CSV; the file name records how many
# leading rows belong to the topic itself.
for key, item in all_data.items():
    merged_frame = pd.DataFrame(data=item['set'], columns=[key])
    csv_name = key + f"_[actual_size={item['actual_size']}]_processed_merge.csv"
    merged_frame.to_csv(csv_name)