# Cleaning the NYT-Article Dataframe

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

## Importing the dataframe

In [2]:
data = pd.read_csv('NYT_Australia_articles.csv')

In [3]:
data_cl=data.copy()

## Getting Information to the dataframe

In [4]:
data_cl.head()

Unnamed: 0,web_url,source,headline,keyword1,keyword2,keyword3,keyword4,keyword5,pub_date,document_type,news_desk,section_name,subsection_name,type_of_material,word_count
0,https://www.nytimes.com/2022/10/30/world/austr...,The New York Times,Tensions Over Racism Rock Australian Netball,Australia,Netball (Sport),Discrimination,Indigenous Australians,,2022-10-30 10:53:55+00:00,article,Foreign,World,Australia,News,1321
1,https://www.nytimes.com/2022/10/29/world/austr...,The New York Times,"After Years of ‘Hell’ in ISIS Detention Camp, ...",Al Hol Detention Camp (Syria),Islamic State in Iraq and Syria (ISIS),Australia,Syria,Detainees,2022-10-29 04:39:54+00:00,article,Foreign,World,Australia,News,910
2,https://www.nytimes.com/2022/10/21/world/austr...,The New York Times,How Australia Fell Behind on Data Privacy,Cyberattacks and Hackers,Privacy,Regulation and Deregulation of Industry,Australia,,2022-10-21 10:57:58+00:00,article,Foreign,World,Australia,News,1054
3,https://www.nytimes.com/2022/10/17/business/ap...,The New York Times,New Crack in Apple’s Armor as Dozens Strike at...,Apple Inc,Australia,Strikes,Organized Labor,Wages and Salaries,2022-10-18 02:28:24+00:00,article,Business,Business Day,,News,1165
4,https://www.nytimes.com/video/world/australia/...,AP,Flooding Batters Southeast Australia,Australia,Floods,Coastal Areas,Deaths (Fatalities),Evacuations and Evacuees,2022-10-15 17:53:20+00:00,multimedia,,World,Australia,Video,0


In [5]:
data_cl.shape

(14054, 15)

In [6]:
data_cl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14054 entries, 0 to 14053
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   web_url           14054 non-null  object
 1   source            14013 non-null  object
 2   headline          14054 non-null  object
 3   keyword1          11017 non-null  object
 4   keyword2          10509 non-null  object
 5   keyword3          10007 non-null  object
 6   keyword4          9121 non-null   object
 7   keyword5          7632 non-null   object
 8   pub_date          14054 non-null  object
 9   document_type     14054 non-null  object
 10  news_desk         13922 non-null  object
 11  section_name      14041 non-null  object
 12  subsection_name   5720 non-null   object
 13  type_of_material  13999 non-null  object
 14  word_count        14054 non-null  int64 
dtypes: int64(1), object(14)
memory usage: 1.6+ MB


## Check for Duplicates and NaNs

In [7]:
# Duplicates:

In [8]:
data_cl = data_cl.drop_duplicates()

In [9]:
data_cl.shape

(13539, 15)

In [10]:
# 515 duplicates got deleted

In [11]:
# NaNs:

In [12]:
data_cl.isna().sum()

web_url                0
source                39
headline               0
keyword1            2868
keyword2            3363
keyword3            3850
keyword4            4709
keyword5            6152
pub_date               0
document_type          0
news_desk            131
section_name          12
subsection_name     7980
type_of_material      52
word_count             0
dtype: int64

In [13]:
data_cl.isna().sum()

web_url                0
source                39
headline               0
keyword1            2868
keyword2            3363
keyword3            3850
keyword4            4709
keyword5            6152
pub_date               0
document_type          0
news_desk            131
section_name          12
subsection_name     7980
type_of_material      52
word_count             0
dtype: int64

### checking columns with a lot of NaNs:

In [14]:
# Source:

In [15]:
data_cl['source'].value_counts(dropna=False)

The New York Times              13156
International New York Times      274
NaN                                39
AP                                 33
Reuters                            23
Wirecutter                         11
WQXR                                3
Name: source, dtype: int64

In [16]:
data_cl = data_cl.dropna(subset=['source'])

In [17]:
# News Desk:

In [18]:
data_cl['news_desk'].value_counts(dropna=False)

NYTNow         2683
Foreign        2407
Sports         1538
Business        796
Culture         599
               ... 
Burst             1
Technology        1
Movies            1
Automobiles       1
Test              1
Name: news_desk, Length: 74, dtype: int64

In [19]:
data_cl['news_desk'] = data_cl['news_desk'].replace(np.nan, 'None')

In [20]:
# section_name:

In [21]:
data_cl['section_name'].value_counts(dropna=False)

Briefing                       2728
World                          2606
Sports                         1762
Arts                            751
Business Day                    735
Opinion                         652
U.S.                            635
Science                         322
Books                           269
Food                            255
Magazine                        254
Style                           237
Travel                          209
Movies                          199
Climate                         195
Technology                      180
Health                          170
Fashion & Style                 146
Theater                         127
The Learning Network            120
T Magazine                      119
Obituaries                      115
Well                            112
New York                        108
Real Estate                      97
Podcasts                         46
Corrections                      42
Parenting                   

In [22]:
data_cl = data_cl.dropna(subset=['section_name'])

In [23]:
# subsection_name:

In [24]:
data_cl['subsection_name'].value_counts(dropna=False)

NaN                       7930
Australia                 1057
Tennis                     783
Asia Pacific               745
Europe                     338
                          ... 
Retirement                   1
International Business       1
Opinion | Culture            1
Travel                       1
Toddler                      1
Name: subsection_name, Length: 77, dtype: int64

In [25]:
# Too many NaNs, dropping the column later

In [26]:
# type_of_material:

In [27]:
data_cl['type_of_material'].value_counts(dropna=False)

News                   8682
briefing               2757
Interactive Feature     554
Op-Ed                   542
Review                  278
Obituary (Obit)         259
Video                    89
Letter                   66
Newsletter               48
Correction               36
Editorial                34
Schedule                 31
News Analysis            30
List                     29
Slideshow                23
Quote                    10
Biography                 7
Question                  6
NaN                       4
An Appraisal              1
Audio Podcast             1
Interview                 1
Text                      1
Name: type_of_material, dtype: int64

In [28]:
data_cl = data_cl.dropna(subset=['type_of_material'])

In [29]:
# keyword1:

In [30]:
data_cl['keyword1'].value_counts(dropna=False)

NaN                                   2829
Tennis                                 693
Coronavirus (2019-nCoV)                543
Australia                              288
Books and Literature                   230
                                      ... 
Qatar Airways                            1
Imahara, Grant (1970-2020)               1
Santa Cruz (Calif)                       1
Green Lantern (Fictional Character       1
Electoral College                        1
Name: keyword1, Length: 2270, dtype: int64

In [31]:
data_cl['keyword1'] = data_cl['keyword1'].replace(np.nan, 'None')

In [32]:
# keyword2:

In [33]:
data_cl['keyword2'].value_counts(dropna=False)

NaN                                     3321
Coronavirus (2019-nCoV)                  271
Tennis                                   206
Australia                                187
Politics and Government                  177
                                        ... 
Borotra, Jean                              1
Winton, Tim                                1
Weller, Kurt (Crossword Constructor)       1
Cromwell, James                            1
House Committee on Intelligence            1
Name: keyword2, Length: 2818, dtype: int64

In [34]:
data_cl['keyword2'] = data_cl['keyword2'].replace(np.nan, 'None')

In [35]:
# keyword3:

In [36]:
data_cl['keyword3'].value_counts(dropna=False)

NaN                                       3808
Australia                                  240
Coronavirus (2019-nCoV)                    201
Politics and Government                    146
United States Politics and Government       80
                                          ... 
Sex                                          1
Tripp, Linda R                               1
Civil Rights Movement (1954-68)              1
Reggae Music                                 1
Reading and Writing Skills (Education)       1
Name: keyword3, Length: 3319, dtype: int64

In [37]:
data_cl['keyword3'] = data_cl['keyword3'].replace(np.nan, 'None')

In [38]:
# keyword4:

In [39]:
data_cl['keyword4'].value_counts(dropna=False)

NaN                                         4665
Australia                                    285
Coronavirus (2019-nCoV)                      151
Politics and Government                      110
China                                         74
                                            ... 
Dynasty (TV Program)                           1
Blank, Radha                                   1
The Mystery of Charles Dickens (Book)          1
Trades (Sports)                                1
Sutherland Springs, Tex, Shooting (2017)       1
Name: keyword4, Length: 3356, dtype: int64

In [40]:
# Too many NaNs, dropping the column later

In [41]:
# keyword5:

In [42]:
data_cl['keyword5'].value_counts(dropna=False)

NaN                                           6105
Australia                                      228
Politics and Government                         83
Coronavirus (2019-nCoV)                         82
China                                           73
                                              ... 
Li Keqiang                                       1
Jingle Jangle: A Christmas Journey (Movie)       1
Religious Cults                                  1
Person to Person (Movie)                         1
Azores Islands                                   1
Name: keyword5, Length: 3146, dtype: int64

In [43]:
# Too many NaNs, dropping the column later

## Dropround Nr. 1:

In [44]:
data_cl = data_cl.drop(['web_url', 'keyword4', 'keyword5', 'subsection_name'], axis=1)

In [45]:
data_cl.isna().sum()

source              0
headline            0
keyword1            0
keyword2            0
keyword3            0
pub_date            0
document_type       0
news_desk           0
section_name        0
type_of_material    0
word_count          0
dtype: int64

## Changing the date_time of pub_date

In [46]:
data_cl['pub_date'].dtype

dtype('O')

In [47]:
data_cl['pub_date'] = pd.to_datetime(data['pub_date'], errors='coerce')

In [48]:
data_cl['pub_date'].dtype

datetime64[ns, UTC]

In [49]:
data_cl['pub_date'] = pd.to_datetime(data_cl['pub_date']).dt.date

In [50]:
data_cl.head()

Unnamed: 0,source,headline,keyword1,keyword2,keyword3,pub_date,document_type,news_desk,section_name,type_of_material,word_count
0,The New York Times,Tensions Over Racism Rock Australian Netball,Australia,Netball (Sport),Discrimination,2022-10-30,article,Foreign,World,News,1321
1,The New York Times,"After Years of ‘Hell’ in ISIS Detention Camp, ...",Al Hol Detention Camp (Syria),Islamic State in Iraq and Syria (ISIS),Australia,2022-10-29,article,Foreign,World,News,910
2,The New York Times,How Australia Fell Behind on Data Privacy,Cyberattacks and Hackers,Privacy,Regulation and Deregulation of Industry,2022-10-21,article,Foreign,World,News,1054
3,The New York Times,New Crack in Apple’s Armor as Dozens Strike at...,Apple Inc,Australia,Strikes,2022-10-18,article,Business,Business Day,News,1165
4,AP,Flooding Batters Southeast Australia,Australia,Floods,Coastal Areas,2022-10-15,multimedia,,World,Video,0


## Checking datatypes

In [51]:
data_cl.dtypes

source              object
headline            object
keyword1            object
keyword2            object
keyword3            object
pub_date            object
document_type       object
news_desk           object
section_name        object
type_of_material    object
word_count           int64
dtype: object

In [52]:
data_cl.head()

Unnamed: 0,source,headline,keyword1,keyword2,keyword3,pub_date,document_type,news_desk,section_name,type_of_material,word_count
0,The New York Times,Tensions Over Racism Rock Australian Netball,Australia,Netball (Sport),Discrimination,2022-10-30,article,Foreign,World,News,1321
1,The New York Times,"After Years of ‘Hell’ in ISIS Detention Camp, ...",Al Hol Detention Camp (Syria),Islamic State in Iraq and Syria (ISIS),Australia,2022-10-29,article,Foreign,World,News,910
2,The New York Times,How Australia Fell Behind on Data Privacy,Cyberattacks and Hackers,Privacy,Regulation and Deregulation of Industry,2022-10-21,article,Foreign,World,News,1054
3,The New York Times,New Crack in Apple’s Armor as Dozens Strike at...,Apple Inc,Australia,Strikes,2022-10-18,article,Business,Business Day,News,1165
4,AP,Flooding Batters Southeast Australia,Australia,Floods,Coastal Areas,2022-10-15,multimedia,,World,Video,0


## Dealing with values of categorical columns

### Column "source"

In [53]:
data_cl['source'].value_counts(dropna=False)

The New York Times              13155
International New York Times      274
AP                                 33
Reuters                            23
Name: source, dtype: int64

In [54]:
def cleanSource(x):
    if x in ['The New York Times']:
        return 'NYT'
    elif x in ['International New York Times']:
        return 'InternationalNYT'
    else:
        return 'others'

data_cl['source'] = list(map(cleanSource, data_cl['source']))

In [55]:
data_cl['source'].value_counts(dropna=False)

NYT                 13155
InternationalNYT      274
others                 56
Name: source, dtype: int64

### Column "headline"

In [56]:
data_cl['headline'].value_counts(dropna=False)

Coronavirus Briefing: What Happened Today                              101
Your Monday Briefing                                                    83
Your Tuesday Briefing                                                   80
Your Friday Briefing                                                    77
Your Thursday Briefing                                                  65
                                                                      ... 
The Lockdown That Felt Like It Might Last Forever Has Finally Ended      1
Australia, Tell Us the Places in the World You Love the Most             1
In One of the World’s Longest Lockdowns, One Man Is Omnipresent          1
The Great Barrier Reef Has Lost Half Its Corals                          1
One Man’s Self-Imposed News ‘Blockade’                                   1
Name: headline, Length: 12827, dtype: int64

In [57]:
# 12827 unique values, not possible to use it as categorical 
# use it in a different nb for NLP Model
# drop it later on

### Column "keyword1"

In [58]:
data_cl['keyword1'].value_counts(dropna=False)

None                                  2829
Tennis                                 693
Coronavirus (2019-nCoV)                543
Australia                              288
Books and Literature                   230
                                      ... 
Qatar Airways                            1
Imahara, Grant (1970-2020)               1
Santa Cruz (Calif)                       1
Green Lantern (Fictional Character       1
Electoral College                        1
Name: keyword1, Length: 2270, dtype: int64

In [59]:
def cleanKeyword(x):
    if x in ['None']:
        return 'None'
    elif x in ['Australia']:
        return 'Australia'    
    elif x in ['Coronavirus (2019-nCoV)', 'Vaccination and Immunization']:
        return 'Covid'          
    elif x in ['Books and Literature', 'Movies', 'Television', 'Theater', 'Art']:
        return 'Culture'   
    elif x in ['Tennis', 'Australian Open (Tennis)', 'United States Open (Tennis)', 'French Open (Tennis)', 'Wimbledon Tennis Tournament', 'Djokovic, Novak']:
        return 'Tennis'
    elif x in ['China']:
        return 'China'       
    elif x in ['Politics and Government']:
        return 'Politics and Government'        
    elif x in ['Global Warming', 'Greenhouse Gas Emissions', '']:
        return 'Global Warming'
    elif x in ['Travel and Vacations']:
        return 'Travel and Vacations'
    elif x in ['United States International Relations', 'United States Politics and Government', 'Trump']:
        return 'US politics and relations'
    else:
        return 'others'

In [60]:
data_cl['keyword1'] = list(map(cleanKeyword, data_cl['keyword1']))

In [61]:
data_cl['keyword1'].value_counts(dropna=False)

others                       7214
None                         2829
Tennis                        909
Culture                       815
Covid                         583
Australia                     288
Politics and Government       217
US politics and relations     188
Global Warming                161
China                         144
Travel and Vacations          137
Name: keyword1, dtype: int64

### Column "keyword2"

In [62]:
data_cl['keyword2'].value_counts(dropna=False)

None                                    3321
Coronavirus (2019-nCoV)                  271
Tennis                                   206
Australia                                187
Politics and Government                  177
                                        ... 
Borotra, Jean                              1
Winton, Tim                                1
Weller, Kurt (Crossword Constructor)       1
Cromwell, James                            1
House Committee on Intelligence            1
Name: keyword2, Length: 2818, dtype: int64

In [63]:
data_cl['keyword2'] = list(map(cleanKeyword, data_cl['keyword2']))

In [64]:
data_cl['keyword2'].value_counts(dropna=False)

others                       8040
None                         3321
Tennis                        702
Covid                         351
Culture                       271
Australia                     187
Politics and Government       177
US politics and relations     137
Global Warming                121
China                         113
Travel and Vacations           65
Name: keyword2, dtype: int64

### Column "keyword3"

In [65]:
data_cl['keyword3'].value_counts(dropna=False)

None                                      3808
Australia                                  240
Coronavirus (2019-nCoV)                    201
Politics and Government                    146
United States Politics and Government       80
                                          ... 
Sex                                          1
Tripp, Linda R                               1
Civil Rights Movement (1954-68)              1
Reggae Music                                 1
Reading and Writing Skills (Education)       1
Name: keyword3, Length: 3319, dtype: int64

In [66]:
data_cl['keyword3'] = list(map(cleanKeyword, data_cl['keyword3']))

In [67]:
data_cl['keyword3'].value_counts(dropna=False)

others                       8313
None                         3808
Covid                         265
Australia                     240
Tennis                        230
Politics and Government       146
US politics and relations     145
Culture                       114
Global Warming                107
China                          71
Travel and Vacations           46
Name: keyword3, dtype: int64

### Column "document_type"

In [68]:
data_cl['document_type'].value_counts(dropna=False)

article       12818
multimedia      666
audio             1
Name: document_type, dtype: int64

In [69]:
data_cl[(data_cl['document_type']=='audio')].head()

Unnamed: 0,source,headline,keyword1,keyword2,keyword3,pub_date,document_type,news_desk,section_name,type_of_material,word_count
11159,NYT,NYT: Back Story With The Times’s Aubrey Belford,,,,2018-08-28,audio,,Today's Headlines,Audio Podcast,0


In [70]:
data_cl = data_cl.drop(labels=11159, axis=0)

In [71]:
data_cl = data_cl[data_cl.document_type != 'multimedia']


### Column "news_desk"

In [72]:
data_cl['news_desk'].value_counts(dropna=False)

NYTNow             2683
Foreign            2400
Sports             1503
Business            787
Culture             598
OpEd                549
Science             432
SpecialSections     325
Washington          267
Styles              267
Obits               248
Dining              228
BookReview          208
Arts&Leisure        202
Weekend             200
Travel              189
Express             188
National            161
Climate             151
Magazine            125
TStyle              105
Well                 96
Learning             91
RealEstate           75
SundayBusiness       72
Metro                70
Letters              55
Society              54
NewsDesk             48
Corrections          42
Parenting            38
Editorial            37
Games                35
Metropolitan         32
Summary              29
Podcasts             29
Politics             28
Watching             28
AtHome               21
Investigative        21
Upshot               21
Insider         

In [73]:
def cleanNewsdesk(x):
    if x in ['NYTNow']:
        return 'NYTNow'
    elif x in ['Foreign']:
        return 'Foreign'    
    elif x in ['Sports']:
        return 'Sports'          
    elif x in ['Business']:
        return 'Business'   
    elif x in ['Culture']:
        return 'Culture'
    elif x in ['OpEd']:
        return 'OpEd'       
    elif x in ['Science']:
        return 'Science'  
    elif x in ['SpecialSections']:
        return 'SpecialSections'
    elif x in ['Washington']:
        return 'Washington'
    elif x in ['Styles']:
        return 'Styles'
    elif x in ['Magazine']:
        return 'Magazine'
    elif x in ['Obits']:
        return 'Obits'
    elif x in ['Dining']:
        return 'Dining'
    elif x in ['BookReview']:
        return 'BookReview'
    elif x in ['Arts&Leisure']:
        return 'Arts&Leisure'
    elif x in ['Travel']:
        return 'Travel'
    elif x in ['Weekend']:
        return 'Weekend'
    elif x in ['Express']:
        return 'Express'
    elif x in ['Climate']:
        return 'Climate'
    elif x in ['National']:
        return 'Natural'
    elif x in ['Well']:
        return 'Well'
    elif x in ['TStyle']:
        return 'TStyle'
    else:
        return 'others'

In [74]:
data_cl['news_desk'] = list(map(cleanNewsdesk, data_cl['news_desk']))

In [75]:
data_cl['news_desk'].value_counts(dropna=False)

NYTNow             2683
Foreign            2400
Sports             1503
others              906
Business            787
Culture             598
OpEd                549
Science             432
SpecialSections     325
Washington          267
Styles              267
Obits               248
Dining              228
BookReview          208
Arts&Leisure        202
Weekend             200
Travel              189
Express             188
Natural             161
Climate             151
Magazine            125
TStyle              105
Well                 96
Name: news_desk, dtype: int64

### Column "section_name"

In [76]:
data_cl['section_name'].value_counts(dropna=False)

Briefing                2660
World                   2497
Sports                  1717
Arts                     736
Business Day             726
Opinion                  625
U.S.                     590
Science                  304
Books                    261
Style                    233
Food                     230
Movies                   198
Travel                   197
Technology               176
Climate                  169
Health                   156
Fashion & Style          145
Theater                  127
Magazine                 124
New York                 105
T Magazine               105
Well                     102
Obituaries                99
The Learning Network      92
Real Estate               91
Podcasts                  44
Corrections               42
Parenting                 38
Crosswords & Games        36
Reader Center             31
Watching                  29
The Upshot                23
Times Insider             22
At Home                   22
Smarter Living

In [77]:
def cleanSection(x):
    if x in ['Briefing']:
        return 'Briefing'
    elif x in ['World']:
        return 'World'    
    elif x in ['Sports']:
        return 'Sports' 
    elif x in ['Arts']:
        return 'Arts'   
    elif x in ['Business Day']:
        return 'Business Day'
    elif x in ['Opinion']:
        return 'Opinion'    
    elif x in ['U.S.']:
        return 'U.S.'  
    elif x in ['Science']:
        return 'Science'  
    elif x in ['Books']:
        return 'Books'
    elif x in ['Magazine']:
        return 'Magazine'
    elif x in ['Food']:
        return 'Food'
    elif x in ['Style']:
        return 'Style'
    elif x in ['Travel']:
        return 'Travel'
    elif x in ['Movies']:
        return 'Movies'
    elif x in ['Climate']:
        return 'Climate'
    elif x in ['Technology']:
        return 'Technology'
    elif x in ['Health']:
        return 'Health'
    elif x in ['Fashion & Style']:
        return 'Fashion&Style'
    elif x in ['Theater']:
        return 'Theater'
    elif x in ['The Learning Network']:
        return 'The Learning Network'
    elif x in ['T Magazine']:
        return 'T Magazine'
    elif x in ['Obituaries']:
        return 'Obit'
    elif x in ['Well']:
        return 'Well'
    elif x in ['New York']:
        return 'New York'
    else:
        return 'others'

In [78]:
data_cl['section_name'] = list(map(cleanSection, data_cl['section_name']))

In [79]:
data_cl['section_name'].value_counts(dropna=False)

Briefing                2660
World                   2497
Sports                  1717
Arts                     736
Business Day             726
Opinion                  625
U.S.                     590
others                   444
Science                  304
Books                    261
Style                    233
Food                     230
Movies                   198
Travel                   197
Technology               176
Climate                  169
Health                   156
Fashion&Style            145
Theater                  127
Magazine                 124
T Magazine               105
New York                 105
Well                     102
Obit                      99
The Learning Network      92
Name: section_name, dtype: int64

### Column "type_of_material"

In [80]:
data_cl['type_of_material'].value_counts(dropna=False)

News               8682
briefing           2757
Op-Ed               542
Review              278
Obituary (Obit)     259
Letter               66
Newsletter           48
Correction           36
Editorial            34
Schedule             31
News Analysis        30
List                 29
Quote                10
Biography             7
Question              6
An Appraisal          1
Interview             1
Text                  1
Name: type_of_material, dtype: int64

In [81]:
def cleanMaterial(x):
    if x in ['News']:
        return 'News'
    elif x in ['briefing']:
        return 'Briefing'    
    elif x in ['Interactive Feature']:
        return 'Interactive Feature' 
    elif x in ['Op-Ed']:
        return 'OpEd'   
    elif x in ['Review']:
        return 'Review'
    elif x in ['Obituary (Obit)']:
        return 'Obit'    
    else:
        return 'others'

In [82]:
data_cl['type_of_material'] = list(map(cleanMaterial, data_cl['type_of_material']))

In [83]:
data_cl['type_of_material'].value_counts(dropna=False)

News        8682
Briefing    2757
OpEd         542
others       300
Review       278
Obit         259
Name: type_of_material, dtype: int64

In [84]:
data_cl.head()

Unnamed: 0,source,headline,keyword1,keyword2,keyword3,pub_date,document_type,news_desk,section_name,type_of_material,word_count
0,NYT,Tensions Over Racism Rock Australian Netball,Australia,others,others,2022-10-30,article,Foreign,World,News,1321
1,NYT,"After Years of ‘Hell’ in ISIS Detention Camp, ...",others,others,Australia,2022-10-29,article,Foreign,World,News,910
2,NYT,How Australia Fell Behind on Data Privacy,others,others,others,2022-10-21,article,Foreign,World,News,1054
3,NYT,New Crack in Apple’s Armor as Dozens Strike at...,others,Australia,others,2022-10-18,article,Business,Business Day,News,1165
5,NYT,Rising Waters Again Force Evacuations and Spre...,Australia,others,others,2022-10-15,article,Foreign,World,News,395


## Dropround Nr. 2:

In [85]:
# Because we just kept the articles of the NYT and droppend multimedia, we can get rid of the column all together:

In [86]:
data_cl = data_cl.drop(['document_type'], axis=1)

## 2 new columns: year and month

In [87]:
data_cl['month'] = pd.DatetimeIndex(data_cl['pub_date']).month  #creating a month column from the date column
   
data_cl['year'] = pd.DatetimeIndex(data_cl['pub_date']).year  #creating a year column from the date column

In [88]:
data_cl.tail()

Unnamed: 0,source,headline,keyword1,keyword2,keyword3,pub_date,news_desk,section_name,type_of_material,word_count,month,year
14049,NYT,Understanding Rural America’s Gun Culture,others,others,,2018-03-24,others,Opinion,others,1300,3,2018
14050,NYT,"Corrections: November 28, 2017",,,,2017-11-28,others,others,others,621,11,2017
14051,NYT,"No, Mr. President, It Is ‘a Guns Situation’",others,others,others,2017-11-06,others,Opinion,others,828,11,2017
14052,NYT,A Better Way to Elect a President?,others,others,US politics and relations,2017-11-18,others,Opinion,others,1215,11,2017
14053,NYT,One Man’s Self-Imposed News ‘Blockade’,others,others,US politics and relations,2018-03-17,others,Opinion,others,1324,3,2018


In [89]:
data_cl.shape

(12818, 12)

## Changing data types?

In [90]:
print(data_cl.nunique())

source                  2
headline            12200
keyword1               11
keyword2               11
keyword3               11
pub_date             1804
news_desk              23
section_name           25
type_of_material        6
word_count           2566
month                  12
year                    6
dtype: int64


In [91]:
data_cl.dtypes

source              object
headline            object
keyword1            object
keyword2            object
keyword3            object
pub_date            object
news_desk           object
section_name        object
type_of_material    object
word_count           int64
month                int64
year                 int64
dtype: object

In [92]:
data_cl.head()

Unnamed: 0,source,headline,keyword1,keyword2,keyword3,pub_date,news_desk,section_name,type_of_material,word_count,month,year
0,NYT,Tensions Over Racism Rock Australian Netball,Australia,others,others,2022-10-30,Foreign,World,News,1321,10,2022
1,NYT,"After Years of ‘Hell’ in ISIS Detention Camp, ...",others,others,Australia,2022-10-29,Foreign,World,News,910,10,2022
2,NYT,How Australia Fell Behind on Data Privacy,others,others,others,2022-10-21,Foreign,World,News,1054,10,2022
3,NYT,New Crack in Apple’s Armor as Dozens Strike at...,others,Australia,others,2022-10-18,Business,Business Day,News,1165,10,2022
5,NYT,Rising Waters Again Force Evacuations and Spre...,Australia,others,others,2022-10-15,Foreign,World,News,395,10,2022


In [93]:
# Not necessary, all datatypes were 

## Saving df as csv: 

In [94]:
data_cl.to_csv('NYT_Australia_articles_cleaned.csv', index=False)

In [95]:
data_cl.head()

Unnamed: 0,source,headline,keyword1,keyword2,keyword3,pub_date,news_desk,section_name,type_of_material,word_count,month,year
0,NYT,Tensions Over Racism Rock Australian Netball,Australia,others,others,2022-10-30,Foreign,World,News,1321,10,2022
1,NYT,"After Years of ‘Hell’ in ISIS Detention Camp, ...",others,others,Australia,2022-10-29,Foreign,World,News,910,10,2022
2,NYT,How Australia Fell Behind on Data Privacy,others,others,others,2022-10-21,Foreign,World,News,1054,10,2022
3,NYT,New Crack in Apple’s Armor as Dozens Strike at...,others,Australia,others,2022-10-18,Business,Business Day,News,1165,10,2022
5,NYT,Rising Waters Again Force Evacuations and Spre...,Australia,others,others,2022-10-15,Foreign,World,News,395,10,2022
