# DH199 Treaty Council Corpus Data Cleaning

### Import Packages and Data

In [30]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime
# first read your tweet data csv in as a pandas dataframe
df = pd.read_csv("treaties-speaker.csv", encoding='utf-8', sep = ',')

### Giviing Sentiment Score

In [31]:
# Reference: DH199 Course Material, Prof. Sanders Garcia
# define a function analyze_sentiment for your dataframe
def analyze_sentiment(df):
    #initialize empty list 
    sentiments = []
    #initialize sentiment analyzer
    sid = SentimentIntensityAnalyzer()
    
    # give scores for each row
    # use df.shape[0], which count the number of rows.
    for i in range(df.shape[0]):
        # score each line of text with polarity scores
        line = df['Text'].iloc[i]
        sentiment = sid.polarity_scores(line)
        # append sentiment scores for each line to new columns that are labeled according to the sentiment score labels from vader
        sentiments.append([sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound']])
    # add this data to the data frame
    df[['neg', 'pos', 'neu', 'compound']] = pd.DataFrame(sentiments)
    #give each tweet an overall positive/negative label based on compound score thresholds
    df['Negative'] = df['compound'] < -0.1
    df['Positive'] = df['compound'] > 0.1
    return df

df = analyze_sentiment(df)
df.head()

Unnamed: 0,Document,Treaty Name,Council Start Date,Council End Date,Council City,Council State,Speaker,Position,Native | Non-Native,Tribe or Ethnicity,Text,neg,pos,neu,compound,Negative,Positive
0,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Canaghquieson,Chief,Native,Oneida,At a Congress with the several Nations on Tues...,0.036,0.155,0.809,0.9989,False,True
1,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Abraham,Chief,Native,Mohawk,Then Abraham Chief of the Mohawks after repeat...,0.022,0.158,0.82,0.9706,False,True
2,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Canaghquieson,Chief,Native,Oneida,Then Canaghquieson addressed the whole and des...,0.0,0.106,0.894,0.296,False,True
3,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Native Speaker,,Native,,At a Congress with the several Nations on Frid...,0.062,0.056,0.882,-0.3327,True,False
4,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Native Speaker,,Native,,"Brother, We have hearkened to all that you hav...",0.081,0.103,0.817,0.846,False,True


### Format Datetime

In [32]:
# after manaully put 0000-00-00 for no date
# and copy the items only have start date
df=df[df["Council Start Date"]!='0000-00-00']

# change the df's dates to datetime objects
df["Council Start Date"]=df["Council Start Date"].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
df["Council End Date"]=df["Council End Date"].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))

# seperate starting date 
df["start_year"]=df["Council Start Date"].apply(lambda x: int(x.year))
df["start_month"]=df["Council Start Date"].apply(lambda x: int(x.month))
df["start_day"]=df["Council Start Date"].apply(lambda x: int(x.day))

# seperate end date 
df["end_year"]=df["Council End Date"].apply(lambda x: int(x.year))
df["end_month"]=df["Council End Date"].apply(lambda x: int(x.month))
df["end_day"]=df["Council End Date"].apply(lambda x: int(x.day))

df.head()

Unnamed: 0,Document,Treaty Name,Council Start Date,Council End Date,Council City,Council State,Speaker,Position,Native | Non-Native,Tribe or Ethnicity,...,neu,compound,Negative,Positive,start_year,start_month,start_day,end_year,end_month,end_day
0,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Canaghquieson,Chief,Native,Oneida,...,0.809,0.9989,False,True,1768,10,24,1768,11,5
1,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Abraham,Chief,Native,Mohawk,...,0.82,0.9706,False,True,1768,10,24,1768,11,5
2,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Canaghquieson,Chief,Native,Oneida,...,0.894,0.296,False,True,1768,10,24,1768,11,5
3,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Native Speaker,,Native,,...,0.882,-0.3327,True,False,1768,10,24,1768,11,5
4,1768.txt,Treaty of Fort Stanwix,1768-10-24,1768-11-05,Rome,NY,Native Speaker,,Native,,...,0.817,0.846,False,True,1768,10,24,1768,11,5


### Explore and Clean Data for Typo Correction

In [7]:
### Check Uniqueness
df["Document"].unique()

array(['1768.txt', '1779-09-26.txt', '1781-02-25.txt', '1781-03-09.txt',
       '1781-03-11.txt', '1781-10-29.txt', '1782-04-22.txt',
       '1778-06-14.txt', '1778-06-29.txt', '1778-09-24.txt',
       '1786-11-18.txt', '1788-07-18.txt', nan, '1786-09-19',
       '1783-04-19.txt', '1783-06-28.txt', '1783-09-06.txt',
       '1754-06-21.txt', '(Unknown date) Indian Council.txt',
       '1773-08-18.txt', '1774-05-13.txt', '1774-10-14.txt',
       '1792-09-30.txt', '1794-12-11.txt', '1805-05-23.txt',
       '1794-10-25.txt', '1805-06-08.txt', '1812-11-06.txt',
       '1814-10-18.txt', '1814-10-28.txt', '1778-09-27.txt',
       '1779-07-02.txt', '1782-02-25.txt', '1786-12-24.txt',
       '1783-08-14.txt'], dtype=object)

In [6]:
df["Native | Non-Native"].unique()

array(['Native', 'Native ', 'Non-Native'], dtype=object)

In [35]:
### Data Cleaning
df.replace('Native ','Native',inplace=True)
df["Native | Non-Native"].unique()

array(['Native', 'Non-Native'], dtype=object)

In [36]:
# make a list of all the (unique) nations
nations = list(df["Tribe or Ethnicity"].unique())
nations

['Oneida',
 'Mohawk',
 nan,
 'Aghquessaine',
 'Chauvrons',
 'Wea',
 'Miamis',
 'Potawatomi',
 'Ottawas',
 'Chippewa',
 'Seneca',
 'Scioto',
 'Delawares',
 'the Hurons, Delawares, Shawnee, Ottowas, Chippawas, Poutawattamies, Twightwees, Cherokees, and the Wabash Confederates ',
 'Shawnee',
 'Mingo',
 'Wyandott',
 'Kickapoo',
 'Mohegan',
 'Shawanese',
 'Saakies',
 'Ottawa',
 'Sawkies',
 'Ottowa',
 'Piankeshaw',
 'Chickasaws',
 'British',
 'Miami',
 'Virginia',
 'American']

In [37]:
# Get rid of nan and the group of nations
nations.pop(2)
nations.pop(12)
nations

['Oneida',
 'Mohawk',
 'Aghquessaine',
 'Chauvrons',
 'Wea',
 'Miamis',
 'Potawatomi',
 'Ottawas',
 'Chippewa',
 'Seneca',
 'Scioto',
 'Delawares',
 'Shawnee',
 'Mingo',
 'Wyandott',
 'Kickapoo',
 'Mohegan',
 'Shawanese',
 'Saakies',
 'Ottawa',
 'Sawkies',
 'Ottowa',
 'Piankeshaw',
 'Chickasaws',
 'British',
 'Miami',
 'Virginia',
 'American']

In [38]:
# count the number of records for each item in the nation's list
for i in nations:
    print("The number of record for", i, "is", len(df[df["Tribe or Ethnicity"]==i]))

The number of record for Oneida is 5
The number of record for Mohawk is 3
The number of record for Aghquessaine is 1
The number of record for Chauvrons is 1
The number of record for Wea is 6
The number of record for Miamis is 4
The number of record for Potawatomi is 6
The number of record for Ottawas is 1
The number of record for Chippewa is 2
The number of record for Seneca is 5
The number of record for Scioto is 1
The number of record for Delawares is 8
The number of record for Shawnee is 4
The number of record for Mingo is 1
The number of record for Wyandott is 1
The number of record for Kickapoo is 2
The number of record for Mohegan is 1
The number of record for Shawanese is 1
The number of record for Saakies is 1
The number of record for Ottawa is 1
The number of record for Sawkies is 1
The number of record for Ottowa is 1
The number of record for Piankeshaw is 1
The number of record for Chickasaws is 1
The number of record for British is 33
The number of record for Miami is 2
The

In [40]:
### Data Cleaning
# some nation names are different in their forms
# we combine the same nations by changing the original data frame
# note that Wea and Miamis are different nations
df["Tribe or Ethnicity"].replace('Ottawa','Ottawas',inplace=True)
df["Tribe or Ethnicity"].replace('Ottowa','Ottawas',inplace=True)
df["Tribe or Ethnicity"].replace('Shawnee','Shawanese',inplace=True)
df["Tribe or Ethnicity"].replace('Saakies','Sawkies',inplace=True)
df["Tribe or Ethnicity"].replace('Chippewa','Chickasaws',inplace=True)
df["Tribe or Ethnicity"].replace('Miami','Miamis',inplace=True)
# count the number of records for each item in the nation's list
for i in nations:
    print("The number of record for", i, "is", len(df[df["Tribe or Ethnicity"]==i]))

The number of record for Oneida is 5
The number of record for Mohawk is 3
The number of record for Aghquessaine is 1
The number of record for Chauvrons is 1
The number of record for Wea is 6
The number of record for Miamis is 6
The number of record for Potawatomi is 6
The number of record for Ottawas is 3
The number of record for Chippewa is 0
The number of record for Seneca is 5
The number of record for Scioto is 1
The number of record for Delawares is 8
The number of record for Shawnee is 0
The number of record for Mingo is 1
The number of record for Wyandott is 1
The number of record for Kickapoo is 2
The number of record for Mohegan is 1
The number of record for Shawanese is 5
The number of record for Saakies is 0
The number of record for Ottawa is 0
The number of record for Sawkies is 2
The number of record for Ottowa is 0
The number of record for Piankeshaw is 1
The number of record for Chickasaws is 3
The number of record for British is 33
The number of record for Miami is 0
The

### Export Data

In [41]:
### Export Data
df.to_csv('treaties_datetime_sentiments_cleaned.csv')