The following steps need to be carried out in order to pre-process the data for topic modelling or any other type of text or sentiment analysis:


1. Convert all text to lowercase
2. Remove URLs, Twitter handles, punctuation and any other non-alphanumeric characters
3. Remove stopwords
4. Lemmatization (not stemming, because stemming can produce non-words, e.g. "computer" becomes "comput")

The idea is to perform all of this using Spark or Python.

This notebook does this for the Twitter, Instagram, Facebook and YouTube data.

In [None]:
import sys
import findspark
from pyspark.sql import SparkSession

# Locate the local Spark installation so that pyspark can start a session.
findspark.init()

spark = SparkSession.builder.appName('example application').getOrCreate()
assert sys.version_info >= (3, 4) # make sure we have Python 3.4+
# Compare the Spark version numerically: a plain string comparison would
# wrongly reject e.g. '2.10' or '10.0', which sort before '2.2' as text.
assert tuple(int(part) for part in spark.version.split('.')[:2]) >= (2, 2) # make sure we have Spark 2.2+

Clean Twitter data

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType
from pyspark.sql.functions import lower, col, udf
from pyspark.sql import Row
from nltk.corpus import stopwords
import nltk
from os import listdir
from os.path import isfile, join
import json
import re

# Platform-specific tokens that carry no topical meaning in tweets.
twitter_words = ['https','co','com','twitter','rt']

# Extra stop words, one per line in g10000.txt. The file is read inside a
# context manager so the handle is closed as soon as the list is built.
with open('g10000.txt') as stopword_file:
    extended_stopwords = [word.strip() for word in stopword_file]

# Column names of the tweet JSON records, in schema order.
fields = ['screen_name', 'tweet_text', 'hashtags', 'favorites', 'retweet_count']
# Explicit schema used when reading the tweet JSON files; every column is nullable.
schema = StructType([
    StructField('screen_name', StringType(), True),
    StructField('tweet_text', StringType(), True),
    StructField('hashtags', ArrayType(StringType()), True),
    StructField('favorites', IntegerType(), True),
    StructField('retweet_count', IntegerType(), True),
])

# Function to do the following things: Strips off URLs, punctuations, 
# User Names and Any non alphanumeric characters.
# Separates words with a single space.

def clean_tweets(text):
    """Clean one tweet and return it as a single space-separated string.

    The text has @handles, URLs (``scheme://...``) and every character that
    is not an ASCII letter, digit, space or tab replaced by a space. The
    remaining tokens are filtered against the module-level ``stopwords``,
    ``twitter_words`` and ``extended_stopwords`` collections and then
    lemmatized with the module-level ``lemma`` object.

    Returns None when ``text`` is None (a null column value), so that a
    missing tweet does not raise inside ``re.sub``.
    """
    if text is None:
        return None

    # Raw string: the pattern contains regex escapes (\w, \S, \/) that are
    # not valid Python string escapes.
    cleaned = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split()
    cleaned_ws = [word for word in cleaned
                  if word not in stopwords
                  and word not in twitter_words
                  and word not in extended_stopwords]
    cleaned_wl = [lemma.lemmatize(word) for word in cleaned_ws]

    # Join the lemmatized tokens; joining cleaned_ws here would silently
    # throw the lemmatization step away.
    return ' '.join(cleaned_wl).strip()

# Helpers that clean_tweets looks up by name when it runs.
lemma = nltk.wordnet.WordNetLemmatizer()
stopwords = stopwords.words('english')

# Spark UDF wrapper around the tweet cleaner; it yields a string column.
cleaner = udf(clean_tweets, StringType())

path = 'twitter-data'
# Regular, non-hidden files that sit directly inside the input directory.
twitter_files = [
    entry for entry in listdir(path)
    if not entry.startswith('.') and isfile(join(path, entry))
]

for twitter_file in twitter_files:
    op_file_name = 'twitter-data/cleaned-ext/{}'.format(twitter_file)

    df = spark.read.json('twitter-data/{}'.format(twitter_file), schema=schema)
    # Lowercase the tweet text first, then run the cleaner on that column.
    lowered_text = lower(col('tweet_text')).alias('tweet_text_lower')
    tweet_df = df.select(lowered_text)
    cleaned_text = cleaner('tweet_text_lower').alias('cleaned_tweet')
    tweet_df_cleaned = tweet_df.select(cleaned_text)

    tweet_df_cleaned.write.text(op_file_name)

Clean Instagram Data

In [None]:
from nltk.corpus import stopwords

# Platform-specific tokens that carry no topical meaning in captions.
instagram_words = ['instagram', 'ig', 'https', 'http', 'tbt'] #Need to add more if required

# Extra stop words, one per line in g10000.txt. The file is read inside a
# context manager so the handle is closed as soon as the list is built.
with open('g10000.txt') as stopword_file:
    extended_stopwords = [word.strip() for word in stopword_file]

# Column names of the Instagram JSON records, in schema order.
fields = ['twitter_handle', 'instagram_handle', 'likes', 'comments',
         'hashtags', 'caption', 'timestamp', 'image_thumbnail']

# Explicit schema used when reading the Instagram JSON files; every column is nullable.
schema = StructType([
    StructField('twitter_handle', StringType(), True),
    StructField('instagram_handle', StringType(), True),
    StructField('likes', IntegerType(), True),
    StructField('comments', IntegerType(), True),
    StructField('hashtags', ArrayType(StringType()), True),
    StructField('caption', StringType(), True),
    StructField('timestamp', StringType(), True),
    StructField('image_thumbnail', StringType(), True),
])

def clean_captions(text):
    """Clean one Instagram caption and return it as a space-separated string.

    The text has @handles, URLs (``scheme://...``) and every character that
    is not an ASCII letter, digit, space or tab replaced by a space. The
    remaining tokens are filtered against the module-level ``stopwords``,
    ``instagram_words`` and ``extended_stopwords`` collections and then
    lemmatized with the module-level ``lemma`` object.

    Returns None when ``text`` is None or empty.
    """
    if not text:
        return None

    # Raw string: the pattern contains regex escapes (\w, \S, \/) that are
    # not valid Python string escapes.
    cleaned = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split()
    cleaned_ws = [word for word in cleaned
                  if word not in stopwords
                  and word not in instagram_words
                  and word not in extended_stopwords]
    cleaned_wl = [lemma.lemmatize(word) for word in cleaned_ws]

    # Join the lemmatized tokens; joining cleaned_ws here would silently
    # throw the lemmatization step away.
    return ' '.join(cleaned_wl).strip()
    
# Helpers that clean_captions looks up by name when it runs.
lemma = nltk.wordnet.WordNetLemmatizer()
stopwords = stopwords.words('english')

# Spark UDF wrapper around the caption cleaner; it yields a string column.
cleaner = udf(clean_captions, StringType())

path = 'instagram-data'
# Regular, non-hidden files that sit directly inside the input directory.
instagram_files = [
    entry for entry in listdir(path)
    if not entry.startswith('.') and isfile(join(path, entry))
]

for instagram_file in instagram_files:
    print('Working on file: {}'.format(instagram_file))
    op_file_name = 'instagram-data/cleaned-ext/{}'.format(instagram_file)

    df = spark.read.json('instagram-data/{}'.format(instagram_file), schema=schema)
    # Lowercase the caption first, then run the cleaner on that column.
    lowered_caption = lower(col('caption')).alias('caption_lower')
    ig_df = df.select(lowered_caption)
    cleaned_caption = cleaner('caption_lower').alias('cleaned_caption')
    ig_df_cleaned = ig_df.select(cleaned_caption)

    ig_df_cleaned.write.text(op_file_name)

Clean Facebook data

In [None]:
from nltk.corpus import stopwords

# Platform-specific tokens that carry no topical meaning in post descriptions.
facebook_words = ['facebook', 'fb', 'https', 'http'] #Need to add more if required

# Column names of the Facebook JSON records, in schema order.
# The comma after 'fb_img_link' is required: without it Python concatenates
# the adjacent literals into the single bogus name 'fb_img_linkfb_shares'.
fields = ['twitter_handle', 'fb_handle', 'fb_name', 'fb_no_of_comments',
         'fb_time_created', 'fb_description', 'fb_post_link', 'fb_img_link',
         'fb_shares', 'fb_type']

# Explicit schema used when reading the Facebook JSON files; every column is nullable.
schema = StructType([
    StructField('twitter_handle', StringType(), True),
    StructField('fb_handle', StringType(), True),
    StructField('fb_name', StringType(), True),
    StructField('fb_no_of_comments', IntegerType(), True),
    StructField('fb_time_created', StringType(), True),
    StructField('fb_description', StringType(), True),
    StructField('fb_post_link', StringType(), True),
    StructField('fb_img_link', StringType(), True),
    StructField('fb_shares', IntegerType(), True),
    StructField('fb_type', StringType(), True),
])

def clean_description(text):
    """Clean one Facebook post description and return it as a space-separated string.

    The text has @handles, URLs (``scheme://...``) and every character that
    is not an ASCII letter, digit, space or tab replaced by a space. The
    remaining tokens are filtered against the module-level ``stopwords``,
    ``facebook_words`` and ``extended_stopwords`` collections and then
    lemmatized with the module-level ``lemma`` object.

    Returns None when ``text`` is None or empty.
    """
    if not text:
        return None

    # Raw string: the pattern contains regex escapes (\w, \S, \/) that are
    # not valid Python string escapes.
    cleaned = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split()
    cleaned_ws = [word for word in cleaned
                  if word not in stopwords
                  and word not in facebook_words
                  and word not in extended_stopwords]
    cleaned_wl = [lemma.lemmatize(word) for word in cleaned_ws]

    # Join the lemmatized tokens; joining cleaned_ws here would silently
    # throw the lemmatization step away.
    return ' '.join(cleaned_wl).strip()
    
# Helpers that clean_description looks up by name when it runs.
lemma = nltk.wordnet.WordNetLemmatizer()
stopwords = stopwords.words('english')

# Spark UDF wrapper around the description cleaner; it yields a string column.
cleaner = udf(clean_description, StringType())

path = 'facebook-data'
# Regular, non-hidden files that sit directly inside the input directory.
facebook_files = [
    entry for entry in listdir(path)
    if not entry.startswith('.') and isfile(join(path, entry))
]

for facebook_file in facebook_files:
    print('Working on file: {}'.format(facebook_file))
    op_file_name = 'facebook-data/cleaned-ext/{}'.format(facebook_file)

    df = spark.read.json('facebook-data/{}'.format(facebook_file), schema=schema)
    # Lowercase the description first, then run the cleaner on that column.
    lowered_description = lower(col('fb_description')).alias('fb_description_lower')
    fb_df = df.select(lowered_description)
    cleaned_description = cleaner('fb_description_lower').alias('cleaned_description')
    fb_df_cleaned = fb_df.select(cleaned_description)

    fb_df_cleaned.write.text(op_file_name)

In [None]:
from nltk.corpus import stopwords
# Scratch cell. The call below is the accessor that the cleaning cells use;
# the commented-out line is an earlier attempt kept for reference.
# stopwords.stopwords('english')
# Bare expression: presumably left as the cell's last statement so the
# notebook displays the returned English stop-word list.
stopwords.words('english')

Clean YouTube data

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType
from pyspark.sql.functions import lower, col, udf
from pyspark.sql import Row
from nltk.corpus import stopwords
import nltk
import pandas as pd
from os import listdir, makedirs
from os.path import isfile, join, exists
import json
import re

# Spark context handle; used further down to read caption files as RDDs.
sc = spark.sparkContext

# Influencer master list; its (rows, columns) shape is printed as a sanity check.
all_data = pd.read_csv('influencer_list.csv', sep=',')
influencer_shape = all_data.shape
print(influencer_shape)

def get_handle(url):
    """Return the last path segment of a profile URL.

    A single trailing slash is ignored, so ``.../name`` and ``.../name/``
    both yield ``name``.

    Returns None when ``url`` is empty or not a string. A missing CSV cell
    arrives from pandas as a float NaN, which has no ``split`` method, so the
    type is checked before the URL is touched.
    """
    if not isinstance(url, str) or not url:
        return None

    splits = url.split('/')
    # With a trailing slash the final segment is empty; take the one before it.
    return splits[-2] if url.endswith('/') else splits[-1]


# Map each influencer's Twitter handle to their lower-cased category name.
category_dict = {}
for index, row in all_data.iterrows():
    handle = get_handle(row['Twitter'])
    category_name = str(row['Category'])
    category_dict.update({handle: category_name.lower()})


# Platform-specific tokens that carry no topical meaning in video captions.
youtube_words = ['youtube', 'yt', 'https', 'http'] #Need to add more if required

# Column names of the YouTube JSON records, in schema order.
# The comma after 'description' is required: without it Python concatenates
# the adjacent literals into the single bogus name 'descriptiontags'.
fields = ['twitter_handle', 'video_id', 'likes', 'dislikes',
         'comments', 'views', 'title', 'description',
         'tags', 'publishat', 'cc_filename']

# Explicit schema used when reading the YouTube JSON files; every column is nullable.
schema = StructType([
    StructField('twitter_handle', StringType(), True),
    StructField('video_id', StringType(), True),
    StructField('likes', IntegerType(), True),
    StructField('dislikes', IntegerType(), True),
    StructField('comments', IntegerType(), True),
    StructField('views', IntegerType(), True),
    StructField('title', StringType(), True),
    StructField('description', StringType(), True),
    StructField('tags', ArrayType(StringType()), True),
    StructField('publishat', StringType(), True),
    StructField('cc_filename', StringType(), True),
])

def clean_cc(text):
    """Clean one line of YouTube closed-caption text.

    The text has @handles, URLs (``scheme://...``) and every character that
    is not an ASCII letter, digit, space or tab replaced by a space. The
    remaining tokens are filtered against the module-level ``stopwords``,
    ``youtube_words`` and ``extended_stopwords`` collections and then
    lemmatized with the module-level ``lemma`` object.

    Returns the cleaned, space-separated string, or None when ``text`` is
    None or empty.
    """
    if not text:
        return None

    # Raw string: the pattern contains regex escapes (\w, \S, \/) that are
    # not valid Python string escapes.
    cleaned = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split()
    # Filter with the YouTube word list defined in this cell; the Facebook
    # list used previously was a copy-paste leftover.
    cleaned_ws = [word for word in cleaned
                  if word not in stopwords
                  and word not in youtube_words
                  and word not in extended_stopwords]
    cleaned_wl = [lemma.lemmatize(word) for word in cleaned_ws]

    # Join the lemmatized tokens; joining cleaned_ws here would silently
    # throw the lemmatization step away.
    return ' '.join(cleaned_wl).strip()
    
# Helpers that clean_cc looks up by name when it runs.
stopwords = stopwords.words('english')
lemma = nltk.wordnet.WordNetLemmatizer()

# Wrap the caption cleaner defined in this cell. Wrapping clean_description
# (from the Facebook cell) was a copy-paste slip and fails with a NameError
# when this cell is run on its own.
cleaner = udf(clean_cc, StringType())

path = 'youtube-data'
# Regular, non-hidden files that sit directly inside the input directory.
youtube_files = [f for f in listdir(path) if isfile(join(path, f)) and (not f.startswith('.'))]

for youtube_file in youtube_files:
    print('Working on file: {}'.format(youtube_file))
    df = spark.read.json('youtube-data/{}'.format(youtube_file), schema=schema)
    # collect() already returns a list of Row objects.
    cc_files = df.select('cc_filename').collect()
    # NOTE(review): category_dict is keyed by the Twitter handle from
    # get_handle(); this lookup assumes each data file is named exactly like
    # that handle -- confirm, otherwise it raises KeyError.
    category = category_dict[youtube_file]
    cc_path_prefix = '{}/others/{}/{}_cc/'.format(path, category, category)
    output_path_prefix = cc_path_prefix.replace(category + '_cc', category + '_cc_cleaned')
    output_directory = output_path_prefix + youtube_file + '/'
    makedirs(output_directory, exist_ok=True)
    for cc_file in cc_files:
        cc_name = cc_file['cc_filename']
        if cc_name is None:
            # The column is nullable; a video without a caption file has nothing to clean.
            continue
        cc_file_path = cc_path_prefix + cc_name + '.txt'
        if exists(cc_file_path):
            output_path = output_directory + cc_name
            if not exists(output_path):
                cc_txt = sc.textFile(cc_file_path)
                # Lowercase each line before cleaning (the stop-word lists are
                # lowercase), and turn a None result into '' because
                # saveAsTextFile would otherwise write the literal text "None".
                cleaned_cc = cc_txt.map(lambda line: clean_cc(line.lower()) or '')
                cleaned_cc.saveAsTextFile(output_path)
            else:
                print("NOTICE-EXISTS;{}".format(output_path))

In [2]:
# Combine data into a single text file <Sources: Instagram, Facebook, Twitter, YouTube>
# No cleaning
from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType
from pyspark.sql.functions import lower, col, udf
from pyspark.sql import Row
from nltk.corpus import stopwords
import nltk
import pandas as pd
from os import listdir, makedirs
from os.path import isfile, join, exists
import json
import re

# Influencer master list; its (rows, columns) shape is printed as a sanity check.
all_data = pd.read_csv('influencer_list.csv', sep=',')
influencer_shape = all_data.shape
print(influencer_shape)

# sc = spark.sparkContext

def get_handle(url):
    """Return the last path segment of a profile URL.

    A single trailing slash is ignored, so ``.../name`` and ``.../name/``
    both yield ``name``.

    Returns None when ``url`` is empty or not a string. A missing CSV cell
    arrives from pandas as a float NaN, which has no ``split`` method, so the
    type is checked before the URL is touched.
    """
    if not isinstance(url, str) or not url:
        return None

    splits = url.split('/')
    # With a trailing slash the final segment is empty; take the one before it.
    return splits[-2] if url.endswith('/') else splits[-1]

# Column names of the tweet JSON records, in schema order.
tw_fields = ['screen_name', 'tweet_text', 'hashtags', 'favorites', 'retweet_count']
# Explicit schema for the tweet JSON files; every column is nullable.
tw_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('screen_name', StringType()),
        ('tweet_text', StringType()),
        ('hashtags', ArrayType(StringType())),
        ('favorites', IntegerType()),
        ('retweet_count', IntegerType()),
    )
])

# Column names of the Instagram JSON records, in schema order.
ig_fields = ['twitter_handle', 'instagram_handle', 'likes', 'comments',
         'hashtags', 'caption', 'timestamp', 'image_thumbnail']

# Explicit schema for the Instagram JSON files; every column is nullable.
ig_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('twitter_handle', StringType()),
        ('instagram_handle', StringType()),
        ('likes', IntegerType()),
        ('comments', IntegerType()),
        ('hashtags', ArrayType(StringType())),
        ('caption', StringType()),
        ('timestamp', StringType()),
        ('image_thumbnail', StringType()),
    )
])

# Column names of the Facebook JSON records, in schema order.
# The comma after 'fb_img_link' is required: without it Python concatenates
# the adjacent literals into the single bogus name 'fb_img_linkfb_shares'.
fb_fields = ['twitter_handle', 'fb_handle', 'fb_name', 'fb_no_of_comments',
         'fb_time_created', 'fb_description', 'fb_post_link', 'fb_img_link',
         'fb_shares', 'fb_type']

# Explicit schema for the Facebook JSON files; every column is nullable.
fb_schema = StructType([
    StructField('twitter_handle', StringType(), True),
    StructField('fb_handle', StringType(), True),
    StructField('fb_name', StringType(), True),
    StructField('fb_no_of_comments', IntegerType(), True),
    StructField('fb_time_created', StringType(), True),
    StructField('fb_description', StringType(), True),
    StructField('fb_post_link', StringType(), True),
    StructField('fb_img_link', StringType(), True),
    StructField('fb_shares', IntegerType(), True),
    StructField('fb_type', StringType(), True),
])

# Column names of the YouTube JSON records, in schema order.
# The comma after 'description' is required: without it Python concatenates
# the adjacent literals into the single bogus name 'descriptiontags'.
yt_fields = ['twitter_handle', 'video_id', 'likes', 'dislikes',
         'comments', 'views', 'title', 'description',
         'tags', 'publishat', 'cc_filename']

# Explicit schema for the YouTube JSON files; every column is nullable.
yt_schema = StructType([
    StructField('twitter_handle', StringType(), True),
    StructField('video_id', StringType(), True),
    StructField('likes', IntegerType(), True),
    StructField('dislikes', IntegerType(), True),
    StructField('comments', IntegerType(), True),
    StructField('views', IntegerType(), True),
    StructField('title', StringType(), True),
    StructField('description', StringType(), True),
    StructField('tags', ArrayType(StringType()), True),
    StructField('publishat', StringType(), True),
    StructField('cc_filename', StringType(), True),
])

# Input directory for each data source.
yt_path, tw_path, ig_path, fb_path = (
    'youtube-data',
    'twitter-data',
    'instagram-data',
    'facebook-data',
)

# Dump every influencer record together with its row index.
for index, row in all_data.iterrows():
    print(index, row)
    


(90, 6)
0 Influencer                                Kevin Curry
Category                                         Food
Twitter                https://twitter.com/fitmencook
Facebook         https://www.facebook.com/FitMenCook/
Instagram       https://www.instagram.com/FitMenCook/
Youtube       https://www.youtube.com/user/fitmencook
Name: 0, dtype: object
1 Influencer                                        Yolanda Gampp
Category                                                   Food
Twitter                    http://www.twitter.com/yolanda_gampp
Facebook        https://www.facebook.com/HowToCakeItWithYolanda
Instagram                    http://instagram.com/yolanda_gampp
Youtube       https://www.youtube.com/channel/UCvM1hVcRJmVWD...
Name: 1, dtype: object
2 Influencer                                     Izy Hossack
Category                                              Food
Twitter                  http://www.twitter.com/izyhossack
Facebook             https://www.facebook.com/izy.hossa