In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from seaborn import set_style
# Select seaborn's "whitegrid" plotting style for the notebook session.
set_style("whitegrid")

In [9]:
def clean_comments_csv(input_filepath, output_filepath):
    """Deduplicate a comments CSV and write the result to a new CSV.

    Steps, in order:
      1. Load ``input_filepath`` with ``pd.read_csv``.
      2. Collapse rows sharing the same ('Post', 'Comment') pair, keeping the
         last occurrence, and renumber the index.
      3. Print how many rows step 2 removed.
      4. Discard rows whose 'Comment' value is the literal string 'Comment'
         (presumably repeated header lines inside the file; confirm against
         the scraper output). These rows are not part of the printed count.
      5. Write the remaining rows to ``output_filepath`` without the index.

    Parameters
    ----------
    input_filepath : path of the CSV to read; must contain 'Post' and
        'Comment' columns.
    output_filepath : path the cleaned CSV is written to.

    Returns
    -------
    None. The result is only available through the written file.
    """
    raw = pd.read_csv(input_filepath)

    deduped = raw.drop_duplicates(subset=['Post', 'Comment'], keep='last')
    deduped = deduped.reset_index(drop=True)

    # Reported before the 'Comment' == 'Comment' filter below is applied.
    n_dropped = len(raw) - len(deduped)
    print(n_dropped, 'duplicates dropped from', input_filepath)

    keep_mask = deduped['Comment'].ne('Comment')
    deduped.loc[keep_mask].to_csv(output_filepath, index=False)

In [18]:
# Source CSV and destination CSV handed to clean_comments_csv by cells that
# reference these two names.
# NOTE(review): the recorded outputs further down mention bpd.csv and
# BorderlinePDisorder.csv as inputs, so these values were evidently edited
# between runs; a fresh top-to-bottom run only sees the values below.
input_filepath = 'C:/Users/ESK/Documents/DataScience/working/datafiles/bpd_memes.csv'
output_filepath = 'cleaned_bpd_memes.csv'

In [15]:
# bpd comments
# The paths are spelled out so this cell always cleans bpd.csv, regardless of
# what the shared input_filepath / output_filepath variables currently hold.
# NOTE(review): the output name follows the 'cleaned_<name>.csv' pattern used
# for bpd_memes; confirm it matches where the cleaned files are read from.
clean_comments_csv(
    'C:/Users/ESK/Documents/DataScience/working/datafiles/bpd.csv',
    'cleaned_bpd.csv',
)

48904 duplicates dropped from C:/Users/ESK/Documents/DataScience/working/datafiles/bpd.csv


In [17]:
# BorderlinePDisorder comments
# The paths are spelled out so this cell always cleans BorderlinePDisorder.csv,
# regardless of what the shared input_filepath / output_filepath variables
# currently hold.
# NOTE(review): the output name follows the 'cleaned_<name>.csv' pattern used
# for bpd_memes; confirm it matches where the cleaned files are read from.
clean_comments_csv(
    'C:/Users/ESK/Documents/DataScience/working/datafiles/BorderlinePDisorder.csv',
    'cleaned_BorderlinePDisorder.csv',
)

70397 duplicates dropped from C:/Users/ESK/Documents/DataScience/working/datafiles/BorderlinePDisorder.csv


In [19]:
# bpd_memes comments: clean the file named by the shared path variables
clean_comments_csv(
    input_filepath=input_filepath,
    output_filepath=output_filepath,
)

0 duplicates dropped from C:/Users/ESK/Documents/DataScience/working/datafiles/bpd_memes.csv


In [4]:
# Load each cleaned comments file into its own DataFrame
# (1: bpd, 2: BorderlinePDisorder, 3: bpd_memes).
cleaned_df1, cleaned_df2, cleaned_df3 = [
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/ESK/Documents/DataScience/working/datafiles/cleaned/cleaned_bpd.csv",
        "C:/Users/ESK/Documents/DataScience/working/datafiles/cleaned/cleaned_BorderlinePDisorder.csv",
        "C:/Users/ESK/Documents/DataScience/working/datafiles/cleaned/cleaned_bpd_memes.csv",
    )
]


In [5]:
# Preview the frame loaded from cleaned_bpd.csv
cleaned_df1

Unnamed: 0.1,Unnamed: 0,Comment,Author,Post
0,52,This post has been marked as a [Venting Post](...,AutoModerator,1c51zji
1,53,Typically a good therapists goal is to help yo...,twinangeldeer,1c51zji
2,54,BPD is notoriously difficult to treat by talk ...,SheNeverDies,1c51zji
3,55,It is really hard to find a good therapist lik...,MeanGreenMother1986,1c51zji
4,56,My therapist told me to just act indifferent w...,Beautiful_Witness748,1c51zji
...,...,...,...,...
65089,113993,[deleted],,ny3ndl
65090,113994,What dose are you on?,depressedthotty,ny3ndl
65091,113995,I’ve been on amisulpride for maybe a year or a...,mhthrowaway7382,ny3ndl
65092,113996,100mg,mhthrowaway7382,ny3ndl


In [6]:
# Preview the frame loaded from cleaned_BorderlinePDisorder.csv
cleaned_df2

Unnamed: 0.1,Unnamed: 0,Comment,Author,Post
0,49047,"adding to your question, did anyone have a the...",grassycroissant,1bmk9m2
1,49048,Like 10 years. I’ve had good experiences and b...,charlottewonder,1bmk9m2
2,49049,"One session and I dipped , because I literally...",Right_now78,1bmk9m2
3,49050,"I’m 28f go every two weeks now, used to go wee...",ComplaintRepulsive52,1bmk9m2
4,49051,I’ve been seeing a therapist since the age of ...,ChocCoveredSarcasm,1bmk9m2
...,...,...,...,...
27672,98069,[deleted],,w5q5qv
27673,98070,I was on zoloft for several years in my early ...,KronikHaze,w5q5qv
27674,98071,I took Lamictal and Lithium and I hated both.,KronikHaze,w5q5qv
27675,98072,What was your dosage for lamictal? (I never tr...,aluap_mia,w5q5qv


In [7]:
# Preview the frame loaded from cleaned_bpd_memes.csv
cleaned_df3

Unnamed: 0.1,Unnamed: 0,Comment,Author,Post
0,247,My mom: don't ever EVER tell ANYBODY that you...,ayaskl,gh3qmr
1,248,Boomers care so much about how they look so if...,i_always_give_karma,gh3qmr
2,249,My therapist tells me not to talk to people ab...,,gh3qmr
3,250,My aunt looked so shocked when I was like “Yea...,gaybitch97,gh3qmr
4,251,"Whoops...sorry folks, I’m a boomer...\n\nActua...",MikiesMom2017,gh3qmr
...,...,...,...,...
21782,38276,I like to stop taking my meds for a week or so...,pittNPatter,nyer4v
21783,38277,Me when I take my meds knowing that they’ll ma...,,nyer4v
21784,38278,Can I ask what meds work for you and their sid...,enola98,nyer4v
21785,38279,"the one that works for my bpd is abilify, whic...",ionichoneycomb,nyer4v


In [11]:
# Stack the three per-subreddit frames into a single frame; ignore_index
# renumbers the rows 0..n-1 instead of keeping each frame's own index.
combined_cleaned_df = pd.concat(
    objs=[cleaned_df1, cleaned_df2, cleaned_df3],
    ignore_index=True,
)

combined_cleaned_df

Unnamed: 0.1,Unnamed: 0,Comment,Author,Post
0,52,This post has been marked as a [Venting Post](...,AutoModerator,1c51zji
1,53,Typically a good therapists goal is to help yo...,twinangeldeer,1c51zji
2,54,BPD is notoriously difficult to treat by talk ...,SheNeverDies,1c51zji
3,55,It is really hard to find a good therapist lik...,MeanGreenMother1986,1c51zji
4,56,My therapist told me to just act indifferent w...,Beautiful_Witness748,1c51zji
...,...,...,...,...
114553,38276,I like to stop taking my meds for a week or so...,pittNPatter,nyer4v
114554,38277,Me when I take my meds knowing that they’ll ma...,,nyer4v
114555,38278,Can I ask what meds work for you and their sid...,enola98,nyer4v
114556,38279,"the one that works for my bpd is abilify, whic...",ionichoneycomb,nyer4v


In [12]:
# Remove the leftover "Unnamed: 0" column carried over from the cleaned CSVs.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
combined_cleaned_df = combined_cleaned_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [13]:
# Show the combined frame through the notebook's rich display (pandas still
# truncates to the first and last rows) instead of a plain-text print.
combined_cleaned_df

                                                  Comment  \
0       This post has been marked as a [Venting Post](...   
1       Typically a good therapists goal is to help yo...   
2       BPD is notoriously difficult to treat by talk ...   
3       It is really hard to find a good therapist lik...   
4       My therapist told me to just act indifferent w...   
...                                                   ...   
114553  I like to stop taking my meds for a week or so...   
114554  Me when I take my meds knowing that they’ll ma...   
114555  Can I ask what meds work for you and their sid...   
114556  the one that works for my bpd is abilify, whic...   
114557                                         Thank you.   

                      Author     Post  
0              AutoModerator  1c51zji  
1              twinangeldeer  1c51zji  
2               SheNeverDies  1c51zji  
3        MeanGreenMother1986  1c51zji  
4       Beautiful_Witness748  1c51zji  
...                      ..

In [14]:
# Persist the combined frame next to the notebook. index=True also writes the
# integer row index as a leading, unnamed column.
# NOTE(review): that column comes back as "Unnamed: 0" when the file is re-read
# with default pd.read_csv settings; confirm downstream readers expect it.
combined_cleaned_df.to_csv(
    path_or_buf="combined_cleaned_df.csv",
    index=True,
)