In [1]:
import pandas as pd 
import numpy as np 
from path import Path

import matplotlib.pyplot as plt 
import re
import string

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
nltk.download('stopwords')
nltk.download('vader_lexicon')


from collections import Counter

from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns
import plotly.express as px

sns.set(style="darkgrid")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/namirsacic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/namirsacic/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
def csv_to_df(csv_file):
    file_path = Path(csv_file)
    df = pd.read_csv(file_path)
    
    return df

In [3]:
#filter for english tweets
def filter_tweets(df):
    df = df[df.lang == "en"]
    
    return df

In [4]:
#Drop rows that don´t contain tweets
def drop_rows(df): 
    index_names = df[ df['created_at'] == "created_at" ].index
  
    # drop these row indexes
    # from dataFrame
    df.drop(index_names, inplace = True)
    
    return df

In [5]:
needed_columns = ["created_at", "tweet"]

def get_series_of_tweets(df):
    
    df = df[needed_columns]
    df.created_at = pd.to_datetime(df.created_at).dt.date
    tweets = df["tweet"]
    
    return tweets

In [6]:
#Following function removes URLs, punctuation, stopwords and converts text into lowercase
def clean_tweets(tweets):
    #Removing URLs from tweets
    remove_url = lambda x: re.sub(r'https\S+', '', str(x))
    tweets_lr = tweets.apply(remove_url)
    tweets_lr
    
    #Convert to lowercase
    to_lower = lambda x: x.lower()
    tweets_lr_lc = tweets_lr.apply(to_lower)
    tweets_lr_lc
    
    #Removing punctuation
    remove_puncs = lambda x: x.translate(str.maketrans("","",string.punctuation))
    tweets_lr_lc_np = tweets_lr_lc.apply(remove_puncs)
    tweets_lr_lc_np
    
    #Remove stopwords
    stop_words = set(stopwords.words("English"))

    remove_words = lambda x: " ".join([word for word in x.split() if word not in stop_words])
    tweets_lr_lc_np_ns = tweets_lr_lc_np.apply(remove_words)
    
    return tweets_lr_lc_np_ns 

In [13]:
def create_csv(df, tweets, csv_name):
    df.tweet = tweets
    df.to_csv(csv_name, encoding='utf-8', index=False)

In [7]:
aapl_df = csv_to_df("../tweets/aapl_tweets.csv")
btc_df = csv_to_df("../tweets/btc_tweets.csv")
jnj_df = csv_to_df("../tweets/jnj_tweets.csv")
msft_df = csv_to_df("../tweets/msft_tweets.csv")
nflx_df = csv_to_df("../tweets/nflx_tweets.csv")
pfe_df = csv_to_df("../tweets/pfe_tweets.csv")
tsla_df = csv_to_df("../tweets/tsla_tweets.csv")
twr_df = csv_to_df("../tweets/twr_tweets.csv")

  if (await self.run_code(code, result,  async_=asy)):


In [8]:
#Drop non-english tweets
aapl_df = filter_tweets(aapl_df)
btc_df = filter_tweets(btc_df)
jnj_df = filter_tweets(jnj_df)
msft_df = filter_tweets(msft_df)
nflx_df = filter_tweets(nflx_df)
pfe_df = filter_tweets(pfe_df)
tsla_df = filter_tweets(tsla_df)
twr_df = filter_tweets(twr_df)

In [9]:
aapl_df = drop_rows(aapl_df)
btc_df = drop_rows(btc_df)
jnj_df = drop_rows(jnj_df)
msft_df = drop_rows(msft_df)
nflx_df = drop_rows(nflx_df)
pfe_df = drop_rows(pfe_df)
tsla_df = drop_rows(tsla_df)
twr_df = drop_rows(twr_df)

In [10]:
aapl_tweets = get_series_of_tweets(aapl_df)
btc_tweets = get_series_of_tweets(btc_df)
jnj_tweets = get_series_of_tweets(jnj_df)
msft_tweets = get_series_of_tweets(msft_df)
nflx_tweets = get_series_of_tweets(nflx_df)
pfe_tweets = get_series_of_tweets(pfe_df)
tsla_tweets = get_series_of_tweets(tsla_df)
twr_tweets = get_series_of_tweets(twr_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [11]:
aapl_tweets = clean_tweets(aapl_tweets)
btc_tweets = clean_tweets(btc_tweets)
jnj_tweets = clean_tweets(jnj_tweets)
msft_tweets = clean_tweets(msft_tweets)
nflx_tweets = clean_tweets(nflx_tweets)
pfe_tweets = clean_tweets(pfe_tweets)
tsla_tweets = clean_tweets(tsla_tweets)
twr_tweets = clean_tweets(twr_tweets)

In [14]:
create_csv(aapl_df, aapl_tweets, "aapl_cleaned.csv")
create_csv(btc_df, btc_tweets, "btc_cleaned.csv")
create_csv(jnj_df, jnj_tweets, "jnj_cleaned.csv")
create_csv(msft_df, msft_tweets, "msft_cleaned.csv")
create_csv(nflx_df, nflx_tweets, "nflx_cleaned.csv")
create_csv(pfe_df, pfe_tweets, "pfe_cleaned.csv")
create_csv(tsla_df, tsla_tweets, "tsla_cleaned.csv")
create_csv(twr_df, twr_tweets, "twr_cleaned.csv")