<a href="https://colab.research.google.com/github/neilkazimierzsheridan/data_science_course/blob/main/ds_Pandas_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
import numpy as np
import pandas as pd

# Pandas: Text methods for string data

In [2]:
email = 'jose@gmail.com'

In [3]:
email.split('@')

['jose', 'gmail.com']

In [5]:
names = pd.Series(['andy', 'bobo', 'clare','david', '5'])
names

0     andy
1     bobo
2    clare
3    david
4        5
dtype: object

In [6]:
names.str.upper()

0     ANDY
1     BOBO
2    CLARE
3    DAVID
4        5
dtype: object

In [7]:
email.isdigit()

False

In [8]:
names.str.isdigit() #element-wise check returning a boolean Series, so it can be used as a filter mask

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [9]:
tech_finance = ['GOOG,APPL,AMZ', 'JPM,BAC,GS']

In [10]:
len(tech_finance)

2

In [11]:
tickers = pd.Series(tech_finance)
tickers #so we want to split this on the commas

0    GOOG,APPL,AMZ
1       JPM,BAC,GS
dtype: object

In [12]:
tickers.str.split(',')

0    [GOOG, APPL, AMZ]
1       [JPM, BAC, GS]
dtype: object

In [15]:
tickers.str.split(',')[0][0] #grab the first one

'GOOG'

In [16]:
#or the last ticker of the second row
tickers.str.split(',')[1][2]

'GS'

In [17]:
#we can return certain items after the split like this
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [18]:
tickers.str.split(',').str[2]

0    AMZ
1     GS
dtype: object

EXPAND

In [20]:
#make them into 3 columns - making a dataframe!
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZ
1,JPM,BAC,GS


In [21]:
#stacking string calls with some messy names
messy_names = pd.Series(['andrew  ', 'bo:bo', '   claire   '])
messy_names

0        andrew  
1           bo:bo
2       claire   
dtype: object

In [25]:
#how to clean this up
messy_names.str.replace(':','').str.strip().str.capitalize()

#remove colons, then strip surrounding whitespace, then capitalize the first letter (lowercasing the rest)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [36]:
def cleanup(name):
    """Return a tidied copy of ``name``.

    Colons are removed, leading/trailing whitespace is stripped, and the
    result is capitalized (first character upper-case, the rest lower-case).
    This is the plain-function equivalent of chaining
    ``.str.replace(':','').str.strip().str.capitalize()`` on a Series.
    """
    return name.replace(':', '').strip().capitalize()



In [37]:
clean_names = messy_names.apply(cleanup)

In [38]:
print(clean_names)

0    Andrew
1      Bobo
2    Claire
dtype: object


In [40]:
messy_names

0        andrew  
1           bo:bo
2       claire   
dtype: object

In [39]:
np.vectorize(cleanup)(messy_names) #np.vectorize also applies cleanup element-wise, but returns a NumPy array rather than a Series

array(['Andrew', 'Bobo', 'Claire'], dtype='<U6')

# Pandas: Time Methods

In [41]:
from datetime import datetime

In [42]:
# Components of the example timestamp: 1 January 2002, 02:30:15
myyear, mymonth, mday = 2002, 1, 1
myhour, mymin, mysec = 2, 30, 15

In [45]:
mydatetime = datetime(myyear, mymonth, mday, myhour, mymin, mysec)

In [46]:
mydatetime

datetime.datetime(2002, 1, 1, 2, 30, 15)

In [47]:
mydatetime.year

2002

In [48]:
mydatetime.minute

30

In [50]:
myser = pd.Series(['Nov 3, 1990', '2000-01-01', None])
myser

0    Nov 3, 1990
1     2000-01-01
2           None
dtype: object

to_datetime

In [51]:
pd.to_datetime(myser) #each string becomes a Timestamp and None becomes NaT (displayed as year-month-day)
# NOTE(review): the entries use two different date formats; pandas >= 2.0 may need format='mixed' here — confirm against the installed version

0   1990-11-03
1   2000-01-01
2          NaT
dtype: datetime64[ns]

In [54]:
timeser = pd.to_datetime(myser)

In [55]:
timeser[0].year

1990

In [56]:
obvi_euro_date = '31-12-2000'


In [57]:
pd.to_datetime(obvi_euro_date) #parsed day-first here, since 31 cannot be a month

Timestamp('2000-12-31 00:00:00')

In [58]:
euro_date = '10-12-2000'

In [59]:
pd.to_datetime(euro_date) #ambiguous string: parsed month-first, giving 12 October rather than 10 December

Timestamp('2000-10-12 00:00:00')

In [60]:
pd.to_datetime(euro_date, dayfirst=True) #so we need this

Timestamp('2000-12-10 00:00:00')

In [62]:
style_date = '12--Dec--2000' #strange date formats


In [63]:
pd.to_datetime(style_date, format='%d--%b--%Y') #explicit format: %d day, %b abbreviated month name, %Y four-digit year

Timestamp('2000-12-12 00:00:00')

In [64]:
custom_date = "12th of Dec 2000" #what about this? What will Pandas think?


In [66]:
pd.to_datetime(custom_date)

Timestamp('2000-12-12 00:00:00')

Reading in a CSV that contains a date/timestamp column

In [71]:
# Colab-specific: this import is only available inside Google Colab
from google.colab import files
uploaded = files.upload()  # indexed by filename in the next cell to get the uploaded file's bytes

Saving RetailSales_BeerWineLiquor.csv to RetailSales_BeerWineLiquor (1).csv


In [72]:
import io
# Wrap the uploaded bytes in a file-like buffer so read_csv can consume them
# (remember pandas can read in lots of other file types too).
# NOTE(review): the key is hard-coded to the original filename — confirm it matches the file that was uploaded
df = pd.read_csv(io.BytesIO(uploaded['RetailSales_BeerWineLiquor.csv']))

In [73]:
df.head()

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822


In [74]:
df['DATE']

0      1992-01-01
1      1992-02-01
2      1992-03-01
3      1992-04-01
4      1992-05-01
          ...    
335    2019-12-01
336    2020-01-01
337    2020-02-01
338    2020-03-01
339    2020-04-01
Name: DATE, Length: 340, dtype: object

In [76]:
# Convert the DATE strings to datetime values, overwriting the column in place
df['DATE'] = pd.to_datetime(df['DATE']) 
df

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [77]:
df['DATE'] #now it is datetime object

0     1992-01-01
1     1992-02-01
2     1992-03-01
3     1992-04-01
4     1992-05-01
         ...    
335   2019-12-01
336   2020-01-01
337   2020-02-01
338   2020-03-01
339   2020-04-01
Name: DATE, Length: 340, dtype: datetime64[ns]

In [78]:
df.loc[0, 'DATE'].year  # year of the timestamp in the row labelled 0

1992

In [86]:
df # NOTE(review): the output shows DATE as the index, but no visible cell sets it (e.g. via set_index) — confirm a cell is not missing

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-01-01,1509
1992-02-01,1541
1992-03-01,1597
1992-04-01,1675
1992-05-01,1822
...,...
2019-12-01,6630
2020-01-01,4388
2020-02-01,4533
2020-03-01,5562
