# Analysis of Immigration Canada Subreddit

## Data Cleaning

In [15]:
# Importing libraries
import pandas as pd
import datetime as dt
from datetime import datetime
import seaborn as sns
from ydata_profiling import ProfileReport
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt


### Data Import

In [16]:
# Importing reddit data from csv
df_all = pd.read_csv('reddit_all_posts.csv')
df_all.name = 'df_all'
df_mixed = pd.read_csv('reddit_mixed_posts.csv')
df_mixed.name = 'df_mixed'
df_top = pd.read_csv('reddit_top_posts.csv')
df_top.name = 'df_top'

# Ensuring 'selftext' column or submission body type string
df_all['selftext'] = df_all['selftext'].astype(str)
df_mixed['selftext'] = df_mixed['selftext'].astype(str)
df_top['selftext'] = df_top['selftext'].astype(str)

### Missing body submission

In [17]:
# Identify the columns with duplicate submission or missing submission body
boolean = df_all.duplicated(subset=['selftext'])

In [18]:
df_all[boolean]['selftext']

14                                                     nan
15                                                     nan
25                                                     nan
31                                                     nan
35                                                     nan
                               ...                        
63816                                            [removed]
63817    My brother is in his mid 20s and has worked fo...
63818                                            [removed]
63819                                            [removed]
63820                                            [removed]
Name: selftext, Length: 63217, dtype: object

In [19]:
# Identify the number of submission body removed
df_all[df_all['selftext'] == '[removed]'].count()

Unnamed: 0         21679
Year               21679
selftext           21679
author             21679
author_fullname    17303
title              21679
link_flair_text    18237
ups                  373
num_comments       21679
id                 21679
media_embed        21679
created_utc        21679
upvote_ratio       20986
dtype: int64

In [20]:
# View submissions with submission body removed to identify if these records should be removed
df_all[df_all['selftext'] == '[removed]'].head()

Unnamed: 0.1,Unnamed: 0,Year,selftext,author,author_fullname,title,link_flair_text,ups,num_comments,id,media_embed,created_utc,upvote_ratio
321,321,2015,[removed],immigrationxiphias,,Migrate to Canada,,1.0,0,3yncn5,{},2015-12-29 12:10:57,
328,328,2015,[removed],immigrationxiphias,,Canada Immigration Services Calgary,,1.0,0,3y9xlv,{},2015-12-26 10:48:03,
329,329,2015,[removed],[deleted],,Canada Immigration Services Calgary,,1.0,0,3y9wxe,{},2015-12-26 10:37:57,
340,340,2015,[removed],immigrationxiphias,,XIPHIAS IMMIGRATION REVIEWS - CUSTOMER REVIEWS,,1.0,0,3xxptx,{},2015-12-23 06:58:44,
362,362,2015,[removed],ghimmigration,,Golden Hands Immigration Inc. GH Immigration Inc.,,0.0,0,3x1rxa,{},2015-12-16 06:27:00,


In [21]:
# Identify submissions with no submission body
df_all[df_all['selftext'] =='nan'].count()

Unnamed: 0         3084
Year               3084
selftext           3084
author             3084
author_fullname    2234
title              3084
link_flair_text    1864
ups                 605
num_comments       3084
id                 3084
media_embed        3084
created_utc        3084
upvote_ratio       1691
dtype: int64

In [22]:
# View submissions with no submission body to identify if these records should be kept
df_all[df_all['selftext'] == 'nan'].head(5)

Unnamed: 0.1,Unnamed: 0,Year,selftext,author,author_fullname,title,link_flair_text,ups,num_comments,id,media_embed,created_utc,upvote_ratio
9,9,2013,,CramBoom,,So you think you're Canadian? - Game-show to t...,,1.0,0,1pbs0y,"{'width': 600, 'height': 450, 'content': '&lt;...",2013-10-27 18:31:22,
14,14,2013,,ranger28,,Canadian Immigration | live and work in Canada...,,0.0,0,1md4wi,{},2013-09-14 06:22:35,
15,15,2013,,[deleted],,Canada’s new immigration program welcomes firs...,,1.0,0,1kknpp,{},2013-08-17 22:32:50,
25,25,2013,,ninjah778,,Immigration tips for students/visitors - getti...,,1.0,0,1bflor,{},2013-04-01 17:31:55,
31,31,2013,,[deleted],,At the moment. All posts seem to be hitting th...,,4.0,4,189fty,{},2013-02-10 20:05:51,


In [23]:
# Function to remove submissions with removed body or Nan
def submission_body(df):
    """Function to drop rows associated with submissions where selftext is removed or Nan
    Input: Dataframe
    Output: Count of submissions with no body, count of submission with removed body, and dataframe info"""
    no_body = df[df['selftext'] == 'nan']
    removed = df[df['selftext'] == '[removed]']
    df.drop(no_body.index, inplace=True)
    df.drop(removed.index, inplace=True)
    print('Submissions with no body removed:', no_body.count())
    print('Submission with removed body:', removed.count())
    print(df.info())
    return df

In [24]:
# Apply function to current dataframes
df_all_analysis = submission_body(df_all.copy())
df_mixed_analysis = submission_body(df_mixed.copy())
df_top_analysis = submission_body(df_top.copy())


Submissions with no body removed: Unnamed: 0         3084
Year               3084
selftext           3084
author             3084
author_fullname    2234
title              3084
link_flair_text    1864
ups                 605
num_comments       3084
id                 3084
media_embed        3084
created_utc        3084
upvote_ratio       1691
dtype: int64
Submission with removed body: Unnamed: 0         21679
Year               21679
selftext           21679
author             21679
author_fullname    17303
title              21679
link_flair_text    18237
ups                  373
num_comments       21679
id                 21679
media_embed        21679
created_utc        21679
upvote_ratio       20986
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 63817
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       39058 non-null  int64  
 1   Year             3905

### Missing category & Other category

#### Submission body

In [28]:
# Wordcloud application

In [87]:
# Create text for Wordcloud
text = "".join(i for i in df_all_analysis.selftext)

In [None]:
# Create wordcloud
custom_stop_words = ['Canada', 'know', 'want', 'now', 'will', 'got', 'year', 'still possible', 'need','able', 'apply', 'say', 'received', 'U', 'help', 'anyone','month', 'looking']
stopwords = list(STOPWORDS) + custom_stop_words

wordcloud = WordCloud(width=3000, height=2000, random_state=1, background_color='white', colormap='Pastel1', stopwords=stopwords).generate(text)
# Display the generated word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

#### Wordcloud of submissions with missing or other category

### Unsupported media