In [35]:
import pandas as pd 
import regex as re
import csv
import datetime

### Clean the dataframe of abstracts and dates, everytime read csv need to convert date to datetime and abstract to string. 

In [36]:
df=pd.read_csv('bush_csv')

In [37]:
df.sort_values(by='date', inplace=True)

In [38]:
df.reset_index(inplace=True)

In [39]:
df

Unnamed: 0,index,abstract,date
0,29432,Peter Marks analysis finds that commercials ru...,2000-01-01T05:00:00+0000
1,29474,The New York Times: The Internet and Political...,2000-01-01T05:00:00+0000
2,29443,Letter by Mike Fremont of Rivers Unlimited on ...,2000-01-02T05:00:00+0000
3,29510,Presidential primary season is the most compet...,2000-01-02T05:00:00+0000
4,29633,Editorial on various campaign proposals for us...,2000-01-02T05:00:00+0000
...,...,...,...
40816,28625,"Republicans have criticized her tweets, but De...",2021-02-25T20:56:39+0000
40817,27791,The disputes are reminiscent of the fight surr...,2021-02-26T00:12:38+0000
40818,28139,Most presidents leave the White House and adop...,2021-02-27T17:00:07+0000
40819,28504,"Democracy, an unassuming policy journal with a...",2021-02-28T22:02:50+0000


#### Convert all abstracts to floats

In [40]:
df['abstract']=df['abstract'].astype(str)

In [41]:
df.iloc[2][1]

"Letter by Mike Fremont of Rivers Unlimited on gains from cleaning up nation's rivers and making their corridors scenic (S)"

#### Remove (M) and (S) from abstract

In [42]:
df['abstract']=df['abstract'].apply(lambda x: re.sub(r"\(.*\)", "", x))

In [43]:
df['abstract'][7]

' INTERNATIONAL   A3-10    War Is Heating Up On Second Chechnya Front  While attention has been focused on the attempt by Russian troops to take the Chechen capital from Islamic militants, fighting has been growing near the mountains of southern Chechnya.   A1    Israel-Syria Talks to Begin  Israeli and Syrian leaders enter knottier terrain today as they resume American-brokered negotiations ambitiously aimed at reaching a peace accord by the summer. American, Syrian and Israeli officials have been careful to dampen expectations that the talks would produce anything concrete.   A1'

#### Remove "backslash" from text - only representation of strong, not actual

In [44]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('\\',''))

In [45]:
df['abstract'][8311] #### backslash represents apostrophe, but doesn't show up in print, see below 

"William B Gould IV Op-Ed article urges final-offer arbitration as alternative to Taft-Hartley Act procedures in face of labor disputes that threaten national health or safety; say that while Pres Bush's invoking of Taft-Hartley in West Coast longshore workers' lockout may be smart legal strategy, it is fundamentally flawed as labor policy, inflaming labor-management passions and dissipating management's willingness to compromise"

In [46]:
print(df['abstract'][8311])

William B Gould IV Op-Ed article urges final-offer arbitration as alternative to Taft-Hartley Act procedures in face of labor disputes that threaten national health or safety; say that while Pres Bush's invoking of Taft-Hartley in West Coast longshore workers' lockout may be smart legal strategy, it is fundamentally flawed as labor policy, inflaming labor-management passions and dissipating management's willingness to compromise


#### Try and identify jibberish abstracts (extra long etc)

In [47]:
df['abstract'].str.len().sort_values(ascending=False).head(30)

20431    22448
14439    13618
1738      8397
1193      7605
853       7316
1901      7216
1147      7040
1623      6869
18370     6376
1755      6344
14387     6140
14299     5856
14517     4032
17565     2876
5292      2506
6260      2390
7900      2230
1407      1924
374       1868
1935      1828
7913      1814
7672      1806
12063     1778
18574     1765
629       1726
1810      1725
9421      1718
17364     1718
8249      1688
905       1687
Name: abstract, dtype: int64

In [48]:
df.iloc[20431][1]

'Following is the transcript of President George W. Bush\'s news conference on Iraq, as provided by CQ Transcriptions, Inc.BUSH: Thank you all very much. I\'m going to spend a little more time on my opening comments than I usually do, but I\'ll save plenty of time for questions. Over the past three years, I have often addressed the American people to explain developments in Iraq. Some of these developments were encouraging, such as the capture of Saddam Hussein, the elections in which 12 million Iraqis defied the terrorists and voted for a free future, and the demise of the brutal terrorist Zarqawi. Other developments were not encouraging, such as the bombing of the U.N. headquarters in Baghdad, the fact that we did not find stockpiles of weapons of mass destruction, and the continued loss of some of America\'s finest sons and daughters. Recently, American and Iraqi forces have launched some of the most aggressive operations on enemy forces in Baghdad since the war began. They have cle

In [49]:
df.iloc[14439][1]

" Yes, He Said That, But What He Really Meant Was . . .       This fall and especially after the debates, The Times has analyzed the accuracy of statements made by the presidential candidates. Here are highlights of that coverage on some of the most important issues of the campaign.    IRAQ       More than a dozen times in the first debate on Oct. 1, President Bush accused Senator John Kerry of continually shifting positions on Iraq.     A review of Mr. Kerry's public statements found that his position had actually been quite consistent. But as the politics changed, Mr. Kerry repeatedly changed his emphasis. News accounts reflected what he was emphasizing at the time. And Mr. Kerry was often unclear in expressing his views.     On Oct. 9, 2002, Mr. Kerry was planning to run for president, but had not yet announced his candidacy. Before he voted to give Mr. Bush the authority to use force against Saddam Hussein in Iraq, he declared on the Senate floor: ''I will support a multilateral ef

In [50]:
mask=(df['abstract'].str.len()>1200)

In [51]:
df[mask]['abstract'].to_list()

["Long ago, political parties had a heavy hand in picking their presidential nominees. Now, in most big states except New York, voters do the picking in primaries or political caucuses. Through an arcane system that requires even candidates who have qualified nationally for federal election funds to amass signatures and jump through qualifying hoops, New York's political parties have, by mutual legislative agreement, limited the candidates on their ballots to two Democrats, Vice President Al Gore and former Senator Bill Bradley, and as few as one Republican in some areas, Gov. George W. Bush of Texas.    The Democrats have made it easier for candidates' names to appear on their ballot, requiring  a campaign to get only 5,000 valid signatures statewide, as opposed to about 20,000 for the Republicans.  But even the lesser requirement is onerous, undemocratic and outdated. New York's entire  ballot-qualification exercise ranks as a political throwback to the old party-boss days before vot

#### Remove punctuation 

In [52]:
df['abstract']=df['abstract'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [53]:
df['abstract'][8311]

'William B Gould IV OpEd article urges finaloffer arbitration as alternative to TaftHartley Act procedures in face of labor disputes that threaten national health or safety say that while Pres Bushs invoking of TaftHartley in West Coast longshore workers lockout may be smart legal strategy it is fundamentally flawed as labor policy inflaming labormanagement passions and dissipating managements willingness to compromise'

#### Remove extra spaces in between and at the end/beginning

In [54]:
df['abstract']=df['abstract'].apply(lambda x: x.replace("\\s{2,}"," "))

In [177]:
##df.drop('abstrace',axis=1, inplace=True)

In [55]:
df

Unnamed: 0,index,abstract,date
0,29432,Peter Marks analysis finds that commercials ru...,2000-01-01T05:00:00+0000
1,29474,The New York Times The Internet and Political ...,2000-01-01T05:00:00+0000
2,29443,Letter by Mike Fremont of Rivers Unlimited on ...,2000-01-02T05:00:00+0000
3,29510,Presidential primary season is the most compet...,2000-01-02T05:00:00+0000
4,29633,Editorial on various campaign proposals for us...,2000-01-02T05:00:00+0000
...,...,...,...
40816,28625,Republicans have criticized her tweets but Dem...,2021-02-25T20:56:39+0000
40817,27791,The disputes are reminiscent of the fight surr...,2021-02-26T00:12:38+0000
40818,28139,Most presidents leave the White House and adop...,2021-02-27T17:00:07+0000
40819,28504,Democracy an unassuming policy journal with an...,2021-02-28T22:02:50+0000


In [56]:
df['abstract']=df['abstract'].apply(lambda x: x.strip())

In [57]:
df['abstract'].to_list()

['Peter Marks analysis finds that commercials run in presidential campaign so far are remarkably tame the candidates apparently unwilling to be the first to go negative',
 'The New York Times The Internet and Political Campaigns',
 'Letter by Mike Fremont of Rivers Unlimited on gains from cleaning up nations rivers and making their corridors scenic',
 'Presidential primary season is the most competitive in 40 years and it could also be the shortest because several states have moved up their primaries in order to exercise greater sway map with primary and caucus schedules photos',
 'Editorial on various campaign proposals for using federal budget surplus warns that if current spending levels are not cut this year and rise at rate of inflation tenyear surplus shrinks to 250 billion or even less recalls enduring lesson of Reagan years that it really does take smoke and mirrors to produce tax cuts spending initiatives and balanced budget all at the same time',
 'A sea change has taken plac

#### Convert date to datetime, do everytime load csv to pandas

In [58]:
type(df['date'][0])

str

In [59]:
df['date']=pd.to_datetime(df['date'])

In [60]:
df

Unnamed: 0,index,abstract,date
0,29432,Peter Marks analysis finds that commercials ru...,2000-01-01 05:00:00+00:00
1,29474,The New York Times The Internet and Political ...,2000-01-01 05:00:00+00:00
2,29443,Letter by Mike Fremont of Rivers Unlimited on ...,2000-01-02 05:00:00+00:00
3,29510,Presidential primary season is the most compet...,2000-01-02 05:00:00+00:00
4,29633,Editorial on various campaign proposals for us...,2000-01-02 05:00:00+00:00
...,...,...,...
40816,28625,Republicans have criticized her tweets but Dem...,2021-02-25 20:56:39+00:00
40817,27791,The disputes are reminiscent of the fight surr...,2021-02-26 00:12:38+00:00
40818,28139,Most presidents leave the White House and adop...,2021-02-27 17:00:07+00:00
40819,28504,Democracy an unassuming policy journal with an...,2021-02-28 22:02:50+00:00


#### Remove Numbers

In [61]:
df['abstract']=df['abstract'].apply(lambda x: re.sub(r'\d+', '', x))

#### Change to lowercase 

In [62]:
df['abstract']=df['abstract'].apply(lambda x: x.lower())

#### Drop index

In [63]:
df.drop('index',axis=1, inplace=True)

In [64]:
df

Unnamed: 0,abstract,date
0,peter marks analysis finds that commercials ru...,2000-01-01 05:00:00+00:00
1,the new york times the internet and political ...,2000-01-01 05:00:00+00:00
2,letter by mike fremont of rivers unlimited on ...,2000-01-02 05:00:00+00:00
3,presidential primary season is the most compet...,2000-01-02 05:00:00+00:00
4,editorial on various campaign proposals for us...,2000-01-02 05:00:00+00:00
...,...,...
40816,republicans have criticized her tweets but dem...,2021-02-25 20:56:39+00:00
40817,the disputes are reminiscent of the fight surr...,2021-02-26 00:12:38+00:00
40818,most presidents leave the white house and adop...,2021-02-27 17:00:07+00:00
40819,democracy an unassuming policy journal with an...,2021-02-28 22:02:50+00:00


In [31]:
df['date'][0].year

2000

### Save dataframe as csv 

In [65]:
df.to_csv('clean_abstract_bush.csv',index=False,header=True)

In [66]:
df=pd.read_csv('clean_abstract_bush.csv')

In [67]:
df

Unnamed: 0,abstract,date
0,peter marks analysis finds that commercials ru...,2000-01-01 05:00:00+00:00
1,the new york times the internet and political ...,2000-01-01 05:00:00+00:00
2,letter by mike fremont of rivers unlimited on ...,2000-01-02 05:00:00+00:00
3,presidential primary season is the most compet...,2000-01-02 05:00:00+00:00
4,editorial on various campaign proposals for us...,2000-01-02 05:00:00+00:00
...,...,...
40816,republicans have criticized her tweets but dem...,2021-02-25 20:56:39+00:00
40817,the disputes are reminiscent of the fight surr...,2021-02-26 00:12:38+00:00
40818,most presidents leave the white house and adop...,2021-02-27 17:00:07+00:00
40819,democracy an unassuming policy journal with an...,2021-02-28 22:02:50+00:00


In [227]:
type(df['date'][0])

str