# Data Clean-up

In [1]:
import logging
import pandas as pd
from sys import getsizeof

# pd.options.display.max_rows = 4000

In [4]:
#Configure logging 
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [5]:
# Read pickle
try:
    records = pd.read_pickle("../data/interim/records.pkl")
except Exception as e: 
    logging.exception("Can't open data pickle!")


In [6]:
records.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1182572 entries, 0 to 1234999
Columns: 110 entries, all_awardings to user_reports
dtypes: bool(1), float64(32), int64(4), object(73)
memory usage: 993.6+ MB


In [7]:
records.dtypes

all_awardings                     object
associated_award                 float64
author                            object
author_flair_background_color     object
author_flair_css_class            object
                                  ...   
approved_at_utc                  float64
banned_at_utc                    float64
view_count                       float64
mod_reports                       object
user_reports                      object
Length: 110, dtype: object

## Remove deleted and removed records

Let's remove records with no `selftext` or that have the "\[removed\]" and "\[deleted\]" tags:


In [8]:
records = records[~records["selftext"].str.contains("\[removed\]|\[deleted\]|^$", case=False, regex=True, na=False)]
records = records[~records["body"].str.contains("\[removed\]|\[deleted\]|^$", case=False, regex=True, na=False)]


In [9]:
records.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1182572 entries, 0 to 1234999
Columns: 110 entries, all_awardings to user_reports
dtypes: bool(1), float64(32), int64(4), object(73)
memory usage: 993.6+ MB


In [0]:
records.to_pickle("../data/interim/records.pkl")

## Saving only interesting columns to a pickled dataframe: 

In [18]:
df = records[
        [
            "author",
            "body",
            "created_utc",
            "id",
            "retrieved_on",
            "updated_utc",
            "full_link",
            "score",
            "num_comments",
            "selftext",
            "title",
            "record",
        ]
    ]

# Merge body and selftext columns into one, replacing nas:    
df["body"] = df["body"].combine_first(df["selftext"])
df.drop("selftext", axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["body"] = df["body"].combine_first(df["selftext"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [19]:
df.head()

Unnamed: 0,author,body,created_utc,id,retrieved_on,updated_utc,full_link,score,num_comments,title,record
0,MrsCare2Share,We use a point system like the one recommended...,1580940674,fgna9j3,1580942818,,,1,,,comment
1,ADHDkitty,"Damn guys, I didn’t mean to make it go this fa...",1580940617,fgna5wx,1580942757,,,1,,,comment
2,blackmedusa941,No problem. Good luck 👍🏾,1580940580,fgna3kr,1580942716,,,1,,,comment
3,tofutak7000,I had this experience before being diagnosed. ...,1580940537,fgna0u8,1580942671,,,2,,,comment
4,Middle_Raccoon,"Yes, both of them, undiagnosed at the time and...",1580940526,fgna072,1580942661,,,2,,,comment


In [20]:
df.to_pickle("../data/interim/records_clean.pkl")

In [0]:
df.info()