# Cleaning New York Times data by removing stopwords and unwanted characters. 

# By Meghana

In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re

In [35]:
def load_data():
    col = ['articles','id']
    data = pd.read_csv('NYfinalsportsresult.txt',header=None,sep='\t',names=col)
    print(data)
    return data

In [36]:
NYT_df = load_data()
NYT_df.head(1000)

                                             articles  id
0   AdvertisementSupported byArticle of the DayBy ... NaN
1                                By MICHAEL KIMMELMAN NaN
2                                       AUG. 26, 2018 NaN
3                                By MICHAEL KIMMELMAN NaN
4                     Illustrations by ANJALI SINGHVI NaN
5                                       AUG. 26, 2018 NaN
6   In the mid-1970s, the clouds parted for Slew H... NaN
7                                       By JOE WARD,  NaN
8                                    BEDEL SAGET and  NaN
9                                     GEOFF MACDONALD NaN
10                                      SEPT. 1, 2018 NaN
11  Novak Djokovic has one of the best return game... NaN
12                               By WM. FERGUSON and  NaN
13                                        NICK VEASEY NaN
14                                       FEB. 1, 2018 NaN
15  How equipment changes turned the players from ... NaN
16          X-

Unnamed: 0,articles,id
0,AdvertisementSupported byArticle of the DayBy ...,
1,By MICHAEL KIMMELMAN,
2,"AUG. 26, 2018",
3,By MICHAEL KIMMELMAN,
4,Illustrations by ANJALI SINGHVI,
5,"AUG. 26, 2018",
6,"In the mid-1970s, the clouds parted for Slew H...",
7,"By JOE WARD,",
8,BEDEL SAGET and,
9,GEOFF MACDONALD,


In [37]:
df  = pd.DataFrame(NYT_df[['articles', 'id']])

In [38]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [39]:
df['articles'] = df['articles'].apply(lambda x: re.sub('[!@#$:).;—,?&_]', '', x.lower()))
df['articles'] = df['articles'].apply(lambda x: re.sub('[0-9]+', '', x.lower()))
df['articles'] = df['articles'].apply(lambda x: re.sub('  ', ' ', x))
df['articles'].head(1000)

0     advertisementsupported byarticle of the dayby ...
1                                  by michael kimmelman
2                                                  aug 
3                                  by michael kimmelman
4                       illustrations by anjali singhvi
5                                                  aug 
6     in the mid-s the clouds parted for slew hester...
7                                          by joe ward 
8                                      bedel saget and 
9                                       geoff macdonald
10                                                sept 
11    novak djokovic has one of the best return game...
12                                  by wm ferguson and 
13                                          nick veasey
14                                                 feb 
15    how equipment changes turned the players from ...
16             x-rays by nick veaseytext by wm ferguson
17                                              

In [40]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

df['articles'] = df['articles'].apply(lambda x: remove_punct(x))
df['articles'].head(100)

0     advertisementsupported byarticle of the dayby ...
1                                  by michael kimmelman
2                                                  aug 
3                                  by michael kimmelman
4                       illustrations by anjali singhvi
5                                                  aug 
6     in the mids the clouds parted for slew hester ...
7                                          by joe ward 
8                                      bedel saget and 
9                                       geoff macdonald
10                                                sept 
11    novak djokovic has one of the best return game...
12                                  by wm ferguson and 
13                                          nick veasey
14                                                 feb 
15    how equipment changes turned the players from ...
16              xrays by nick veaseytext by wm ferguson
17                                              

In [41]:
df["token"] = df["articles"].apply(nltk.word_tokenize)

In [42]:
df["token"] = df.apply(lambda row: nltk.word_tokenize(row["articles"]), axis=1)
df.head(100)

Unnamed: 0,articles,id,token
0,advertisementsupported byarticle of the dayby ...,,"[advertisementsupported, byarticle, of, the, d..."
1,by michael kimmelman,,"[by, michael, kimmelman]"
2,aug,,[aug]
3,by michael kimmelman,,"[by, michael, kimmelman]"
4,illustrations by anjali singhvi,,"[illustrations, by, anjali, singhvi]"
5,aug,,[aug]
6,in the mids the clouds parted for slew hester ...,,"[in, the, mids, the, clouds, parted, for, slew..."
7,by joe ward,,"[by, joe, ward]"
8,bedel saget and,,"[bedel, saget, and]"
9,geoff macdonald,,"[geoff, macdonald]"


In [43]:
import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))



stop = stopwords.words('english')
print(stop)



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [44]:
df['Tokenstopwords']  =  df['token'].apply(lambda x: [item for item in x if item not in stop])
df['Tokenstopwords'].head(100)


0     [advertisementsupported, byarticle, dayby, jer...
1                                  [michael, kimmelman]
2                                                 [aug]
3                                  [michael, kimmelman]
4                      [illustrations, anjali, singhvi]
5                                                 [aug]
6     [mids, clouds, parted, slew, hester, president...
7                                           [joe, ward]
8                                        [bedel, saget]
9                                    [geoff, macdonald]
10                                               [sept]
11    [novak, djokovic, one, best, return, games, te...
12                                       [wm, ferguson]
13                                       [nick, veasey]
14                                                [feb]
15    [equipment, changes, turned, players, artists,...
16              [xrays, nick, veaseytext, wm, ferguson]
17                                              

In [45]:
df['articlestopwords'] = df['articles'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['articlestopwords'].head(100)

0     advertisementsupported byarticle dayby jeremy ...
1                                     michael kimmelman
2                                                   aug
3                                     michael kimmelman
4                          illustrations anjali singhvi
5                                                   aug
6     mids clouds parted slew hester president unite...
7                                              joe ward
8                                           bedel saget
9                                       geoff macdonald
10                                                 sept
11    novak djokovic one best return games tennishe ...
12                                          wm ferguson
13                                          nick veasey
14                                                  feb
15    equipment changes turned players artists techn...
16                    xrays nick veaseytext wm ferguson
17                                              

In [46]:
freq = pd.Series(' '.join(df['articlestopwords']).split()).value_counts()[:10]
freq

said       3263
players    1403
first      1362
one        1322
woods      1272
team       1265
game       1016
last        979
like        977
would       940
dtype: int64

In [47]:
df['articlestopwords'].to_csv("cleanArticleNonToken.txt")
df['Tokenstopwords'].to_csv("cleanArticleToken.txt")

  """Entry point for launching an IPython kernel.
  
