# **ProPublica Dataset Validation**

In [156]:
import pandas as pd

In [157]:
# Path of the raw scrape to validate (Colab-style absolute path).
RAW_CSV_PATH = "/content/propublica10k-non_english-removed[articles+videos].csv"

# Load the whole file into the working frame used by every later cell.
df = pd.read_csv(RAW_CSV_PATH)

**Exploring the dataset**

In [158]:
# Show the column labels, then the row index, one per line.
print(df.columns, df.index, sep="\n")

Index(['article_link', 'title', 'author', 'author_link', 'article_content',
       'date_published', 'content_type', 'youtube_author', 'youtube_link'],
      dtype='object')
RangeIndex(start=0, stop=10118, step=1)


In [159]:
# Summary statistics. At this point the output covers only youtube_author and
# youtube_link, both with count 0, so it says nothing about the text columns.
df.describe()

Unnamed: 0,youtube_author,youtube_link
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


In [160]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type,youtube_author,youtube_link
0,https://www.propublica.org/article/rent-limits...,Rent Limits Just a Fiction for Thousands of NY...,"Cezary Podkul, Marcelo Rochabrun",https://www.propublica.org/people/cezary-podku...,About 28 percent of New York City apar...,2016-03-10T12:58:06-05:00,article,,
1,https://www.propublica.org/article/federal-jud...,Federal Judge Strikes Down Part of Montana’s F...,Marilyn W. Thompson,https://www.propublica.org/people/marilyn-thom...,"In a victory for public health advocates, a f...",2022-12-12T17:00:00-05:00,article,,
2,https://www.propublica.org/article/hrs-2011-sl...,Photos: The Doctor Will See Them Now,ProPublica,https://www.propublica.org/people/propublica,The Heart Rhythm Society held its 2011 confere...,2011-05-06T09:04:00-04:00,article,,
3,https://www.propublica.org/article/government-...,Government Does Not Test for Dangerous Bacteri...,Joaquin Sapien,https://www.propublica.org/people/joaquin-sapien,"Despite evidence that MRSA, an antibio...",2008-06-11T09:58:00-04:00,article,,
4,https://www.propublica.org/article/the-man-beh...,The Man Behind Mumbai,Sebastian Rotella,https://www.propublica.org/people/sebastian-ro...,This article was copublished with the ...,2010-11-13T21:39:17-05:00,article,,


In [161]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type,youtube_author,youtube_link
10113,https://www.propublica.org/article/tracking-hi...,Tracking Highway Stimulus Jobs Is No Easy Job,"Amanda Michel, Christopher Flavelle",https://www.propublica.org/people/amanda-miche...,The Obama administration has put great...,2009-07-24T14:18:00-04:00,article,,
10114,https://www.propublica.org/article/prominent-h...,Prominent Houston Judge Quits St. Luke’s Board...,Charles Ornstein,https://www.propublica.org/people/charles-orns...,A prominent federal judge quietly resigned fr...,2018-09-04T06:00:00-04:00,article,,
10115,https://www.propublica.org/article/sears-story...,The Sears Headquarters Deal Cost Taxpayers $50...,,,"Over the course of 30 years, the taxpayers of...",2020-05-16T06:00:00-04:00,article,,
10116,https://www.propublica.org/article/the-stimulu...,The Stimulus Bills: House vs. Senate,Michael Grabell,https://www.propublica.org/people/michael-grabell,"If all goes as expected today, the Sen...",2009-02-10T09:14:00-05:00,article,,
10117,https://www.propublica.org/article/in-minnesot...,"In Minnesota, Democratic Grandmas Gather Data ...",Lois Beckett,https://www.propublica.org/people/lois-beckett,"In Minnesota, Democratic volunteers sc...",2013-01-09T18:30:47-05:00,article,,


In [162]:
# Sanity-check the URL column: select the rows whose 'article_link' does not
# start with 'http' (that prefix also covers 'https').
# na=False treats a missing link as "does not start with http"; without it a
# NaN in the column would end up in the mask and the `~` negation would fail.
non_https_rows = df[~df['article_link'].str.startswith('http', na=False)]

# print the offending rows; an empty DataFrame means every link has the prefix
print(non_https_rows)

Empty DataFrame
Columns: [article_link, title, author, author_link, article_content, date_published, content_type, youtube_author, youtube_link]
Index: []


**Removing rows with missing values**



In [163]:
# Count the missing values in each column.
df.isnull().sum()

article_link           0
title                  0
author               938
author_link          938
article_content        0
date_published         0
content_type           0
youtube_author     10118
youtube_link       10118
dtype: int64

In [164]:
# Both youtube_* columns are missing in every one of the 10118 rows,
# so remove them before looking at row-level gaps.
df = df.drop(columns=['youtube_author', 'youtube_link'])
print(df.columns)

Index(['article_link', 'title', 'author', 'author_link', 'article_content',
       'date_published', 'content_type'],
      dtype='object')


In [165]:
# Discard every row that still has at least one missing field ...
df = df.dropna(how='any')
# ... renumber the surviving rows from 0 ...
df = df.reset_index(drop=True)
# ... and confirm that no column has gaps left.
df.isna().sum()

article_link       0
title              0
author             0
author_link        0
article_content    0
date_published     0
content_type       0
dtype: int64

**Removing rows with duplicate values**

In [166]:
# Number of rows that repeat an earlier row across all columns.
df.duplicated(keep='first').sum()


0

In [167]:
# Summary of the remaining columns (count / unique / top / freq).
# A 'unique' value below 'count' for title or article_content means repeated
# values survive even though no row is duplicated as a whole.
df.describe()

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type
count,9180,9180,9180,9180,9180,9180,9180
unique,9180,9040,1068,1073,9131,8968,4
top,https://www.propublica.org/article/rent-limits...,This Week in Scandals,Marian Wang,https://www.propublica.org/people/marian-wang,"Welcome to SRSLY, an (experimental) ...",2009-05-06T13:47:50-04:00,article
freq,1,14,565,565,13,7,8869


In [168]:
# Keep the first row for each distinct article body, then renumber the rows.
df = (
    df.drop_duplicates(subset='article_content')
      .reset_index(drop=True)
)
df.describe()

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type
count,9131,9131,9131,9131,9131,9131,9131
unique,9131,9008,1068,1073,9131,8930,4
top,https://www.propublica.org/article/rent-limits...,This Week in Scandals,Marian Wang,https://www.propublica.org/people/marian-wang,About 28 percent of New York City apar...,2010-05-03T13:30:00-04:00,article
freq,1,14,565,565,1,4,8820


In [169]:
# Keep the first row for each distinct title, then renumber the rows.
# NOTE(review): rows that share a title but have different article_content are
# also dropped here, all but the first. Confirm that this is intended.
df = (
    df.drop_duplicates(subset='title')
      .reset_index(drop=True)
)
df.describe()

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type
count,9008,9008,9008,9008,9008,9008,9008
unique,9008,9008,1064,1068,9008,8820,4
top,https://www.propublica.org/article/rent-limits...,Rent Limits Just a Fiction for Thousands of NY...,Marian Wang,https://www.propublica.org/people/marian-wang,About 28 percent of New York City apar...,2012-05-25T06:00:00-04:00,article
freq,1,1,542,542,1,4,8705


In [170]:
# Verify the title de-duplication: gather every row whose title occurs more
# than once (all occurrences are flagged) and show the titles.
duplicates = df.loc[df['title'].duplicated(keep=False)]
print(duplicates.loc[:, 'title'])

Series([], Name: title, dtype: object)


In [171]:
# Check the row index after the de-duplication steps and their reset_index calls.
print(df.index)

RangeIndex(start=0, stop=9008, step=1)


**Adding a 'word length' column (number of words in each article)**

In [172]:
# Preview the last five rows after de-duplication.
df.tail(n=5)

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type
9003,https://www.propublica.org/article/sam-cooper-...,Talking to an Investigative Reporter Who Expos...,Sebastian Rotella,https://www.propublica.org/people/sebastian-ro...,An exclusive news report dominated the headli...,2023-01-06T05:00:00-05:00,article
9004,https://www.propublica.org/article/incoming-re...,Incoming Regulator Promises No More Coddling o...,Jesse Eisinger,https://www.propublica.org/people/jesse-eisinger,The Office of the Comptroller of the C...,2012-06-13T11:00:00-04:00,article
9005,https://www.propublica.org/article/tracking-hi...,Tracking Highway Stimulus Jobs Is No Easy Job,"Amanda Michel, Christopher Flavelle",https://www.propublica.org/people/amanda-miche...,The Obama administration has put great...,2009-07-24T14:18:00-04:00,article
9006,https://www.propublica.org/article/prominent-h...,Prominent Houston Judge Quits St. Luke’s Board...,Charles Ornstein,https://www.propublica.org/people/charles-orns...,A prominent federal judge quietly resigned fr...,2018-09-04T06:00:00-04:00,article
9007,https://www.propublica.org/article/in-minnesot...,"In Minnesota, Democratic Grandmas Gather Data ...",Lois Beckett,https://www.propublica.org/people/lois-beckett,"In Minnesota, Democratic volunteers sc...",2013-01-09T18:30:47-05:00,article


In [173]:
# Word count per article: split the text on whitespace and count the pieces.
# The vectorized .str accessor replaces a per-row Python lambda and yields NaN
# for a missing text instead of raising an AttributeError.
df['word length'] = df['article_content'].str.split().str.len()
print(df.head(3))

                                        article_link  \
0  https://www.propublica.org/article/rent-limits...   
1  https://www.propublica.org/article/federal-jud...   
2  https://www.propublica.org/article/hrs-2011-sl...   

                                               title  \
0  Rent Limits Just a Fiction for Thousands of NY...   
1  Federal Judge Strikes Down Part of Montana’s F...   
2               Photos: The Doctor Will See Them Now   

                             author  \
0  Cezary Podkul, Marcelo Rochabrun   
1               Marilyn W. Thompson   
2                        ProPublica   

                                         author_link  \
0  https://www.propublica.org/people/cezary-podku...   
1  https://www.propublica.org/people/marilyn-thom...   
2       https://www.propublica.org/people/propublica   

                                     article_content  \
0          About 28 percent of New York City apar...   
1   In a victory for public health advocates, a f...   


**Removing rows whose article content has fewer than 100 words**

In [174]:
# Inspect the articles that fall below the 100-word threshold before removing them.
less_word = df.loc[df['word length'].lt(100)]
print(less_word)

                                           article_link  \
91    https://www.propublica.org/article/brief-tech-...   
104   https://www.propublica.org/article/map-injurie...   
110   https://www.propublica.org/article/one-doc-cou...   
153   https://www.propublica.org/article/lobbyists-f...   
156      https://www.propublica.org/article/bailout-map   
...                                                 ...   
8725  https://www.propublica.org/article/iraq-org-chart   
8758  https://www.propublica.org/article/propublicas...   
8799  https://www.propublica.org/article/document-di...   
8849  https://www.propublica.org/article/map-lenders...   
8963  https://www.propublica.org/article/tracking-op...   

                                                  title  \
91                            Brief Tech Troubles Fixed   
104   Map: Injuries and Deaths to Civilian Contracto...   
110   This One Doc Could Have Saved Medicare Million...   
153        Lobbyists for the Stimulus: Disclosure Forms

In [175]:
# Row index before the short articles are removed, for comparison afterwards.
print(df.index)

RangeIndex(start=0, stop=9008, step=1)


In [176]:
# Remove, in place, every row whose 'word length' is below 100.
df.drop(index=df.loc[df['word length'].lt(100)].index, inplace=True)

# Renumber the remaining rows from 0 (also in place) and summarize the result.
df.reset_index(drop=True, inplace=True)
print(df.describe())

        word length
count   8638.000000
mean    1219.145867
std     1325.069498
min      100.000000
25%      432.250000
50%      803.000000
75%     1432.750000
max    14426.000000


In [177]:
# Preview the last five rows after removing the short articles.
df.tail(n=5)

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type,word length
8633,https://www.propublica.org/article/sam-cooper-...,Talking to an Investigative Reporter Who Expos...,Sebastian Rotella,https://www.propublica.org/people/sebastian-ro...,An exclusive news report dominated the headli...,2023-01-06T05:00:00-05:00,article,2103
8634,https://www.propublica.org/article/incoming-re...,Incoming Regulator Promises No More Coddling o...,Jesse Eisinger,https://www.propublica.org/people/jesse-eisinger,The Office of the Comptroller of the C...,2012-06-13T11:00:00-04:00,article,903
8635,https://www.propublica.org/article/tracking-hi...,Tracking Highway Stimulus Jobs Is No Easy Job,"Amanda Michel, Christopher Flavelle",https://www.propublica.org/people/amanda-miche...,The Obama administration has put great...,2009-07-24T14:18:00-04:00,article,754
8636,https://www.propublica.org/article/prominent-h...,Prominent Houston Judge Quits St. Luke’s Board...,Charles Ornstein,https://www.propublica.org/people/charles-orns...,A prominent federal judge quietly resigned fr...,2018-09-04T06:00:00-04:00,article,942
8637,https://www.propublica.org/article/in-minnesot...,"In Minnesota, Democratic Grandmas Gather Data ...",Lois Beckett,https://www.propublica.org/people/lois-beckett,"In Minnesota, Democratic volunteers sc...",2013-01-09T18:30:47-05:00,article,1375


**Removing rows whose article content has 10,000 or more words**

In [178]:
# Inspect the articles with 10000 or more words before removing them.
filtered_df = df.loc[df['word length'].ge(10000)]
print(filtered_df)

                                           article_link  \
440   https://www.propublica.org/article/mexico-drug...   
903   https://www.propublica.org/article/milwaukee-f...   
1477  https://www.propublica.org/article/hes-a-liar-...   
1880  https://www.propublica.org/article/two-coasts-...   
3805  https://www.propublica.org/article/philadelphi...   
4106  https://www.propublica.org/article/9-11-invest...   
4752  https://www.propublica.org/article/mia-mottley...   
4757  https://www.propublica.org/article/the-deadly-...   
5150  https://www.propublica.org/article/inside-the-...   
5291  https://www.propublica.org/article/finding-osc...   
5490  https://www.propublica.org/article/hell-and-hi...   
5930  https://www.propublica.org/article/afghanistan...   
7892  https://www.propublica.org/article/living-apar...   

                                                  title  \
440     Inside the Case that Upended America’s Drug War   
903   The Landlord, the Tenant and a House Fire in M...

In [179]:
# Remove, in place, every row whose 'word length' is 10000 or more.
df.drop(index=df.loc[df['word length'].ge(10000)].index, inplace=True)

# Renumber the remaining rows from 0 (also in place) and summarize the word counts.
df.reset_index(drop=True, inplace=True)
print(df['word length'].describe())

count    8625.000000
mean     1203.713391
std      1263.918806
min       100.000000
25%       432.000000
50%       801.000000
75%      1427.000000
max      9726.000000
Name: word length, dtype: float64


In [180]:
# Preview the last five rows of the fully filtered frame.
df.tail(n=5)

Unnamed: 0,article_link,title,author,author_link,article_content,date_published,content_type,word length
8620,https://www.propublica.org/article/sam-cooper-...,Talking to an Investigative Reporter Who Expos...,Sebastian Rotella,https://www.propublica.org/people/sebastian-ro...,An exclusive news report dominated the headli...,2023-01-06T05:00:00-05:00,article,2103
8621,https://www.propublica.org/article/incoming-re...,Incoming Regulator Promises No More Coddling o...,Jesse Eisinger,https://www.propublica.org/people/jesse-eisinger,The Office of the Comptroller of the C...,2012-06-13T11:00:00-04:00,article,903
8622,https://www.propublica.org/article/tracking-hi...,Tracking Highway Stimulus Jobs Is No Easy Job,"Amanda Michel, Christopher Flavelle",https://www.propublica.org/people/amanda-miche...,The Obama administration has put great...,2009-07-24T14:18:00-04:00,article,754
8623,https://www.propublica.org/article/prominent-h...,Prominent Houston Judge Quits St. Luke’s Board...,Charles Ornstein,https://www.propublica.org/people/charles-orns...,A prominent federal judge quietly resigned fr...,2018-09-04T06:00:00-04:00,article,942
8624,https://www.propublica.org/article/in-minnesot...,"In Minnesota, Democratic Grandmas Gather Data ...",Lois Beckett,https://www.propublica.org/people/lois-beckett,"In Minnesota, Democratic volunteers sc...",2013-01-09T18:30:47-05:00,article,1375


In [181]:
# Final row index; its stop value is the number of rows that will be exported.
print(df.index)

RangeIndex(start=0, stop=8625, step=1)


**Exporting the validated dataset**

In [182]:
# Write the validated frame without the index column. The row count in the
# file name is taken from the frame itself so the name cannot drift from the
# contents if an earlier filter changes.
df.to_csv(f'validated_{len(df)}_data_propublica.csv', index=False)