# Using regression to find or extract text from dataframe

In [1]:
import pandas as pd 
import numpy as np 
import re

In [2]:
emp_df = pd.read_csv('data/employees_astyle.csv')
emp_df.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,0.0,0,90
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,0.0,100,90
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,0.0,100,90
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,0.0,102,60
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,0.0,103,60


In [3]:
#Finding text starting with a string, strings
#Syntax: df["col_name"].str.startswith("string")
#Finding employees have JOB_ID start with "AC"
emp_df[emp_df['JOB_ID'].str.startswith('AC')]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
105,205,Shelley,Higgins,SHIGGINS,515.123.8080,2002-06-07 00:00:00,AC_MGR,12008,0.0,101,110
106,206,William,Gietz,WGIETZ,515.123.8181,2002-06-07 00:00:00,AC_ACCOUNT,8300,0.0,205,110


In [5]:
#Finding employees have JOB_ID start with "AC" and only taking 2 cols: FIRST_NAME and LAST_NAME
emp_df.loc[emp_df['JOB_ID'].str.startswith('AC'),['FIRST_NAME', 'LAST_NAME']]
# or emp_df.loc[emp_df['JOB_ID'].str.startswith('AC')][['FIRST_NAME', 'LAST_NAME']]

Unnamed: 0,FIRST_NAME,LAST_NAME
105,Shelley,Higgins
106,William,Gietz


In [6]:
#Finding employees have JOB_ID contains 'REP'
emp_df[emp_df['JOB_ID'].str.contains('REP')]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
50,150,Peter,Tucker,PTUCKER,011.44.1344.129268,2005-01-30 00:00:00,SA_REP,10000,0.3,145,80
51,151,David,Bernstein,DBERNSTE,011.44.1344.345268,2005-03-24 00:00:00,SA_REP,9500,0.25,145,80
52,152,Peter,Hall,PHALL,011.44.1344.478968,2005-08-20 00:00:00,SA_REP,9000,0.25,145,80
53,153,Christopher,Olsen,COLSEN,011.44.1344.498718,2006-03-30 00:00:00,SA_REP,8000,0.2,145,80
54,154,Nanette,Cambrault,NCAMBRAU,011.44.1344.987668,2006-12-09 00:00:00,SA_REP,7500,0.2,145,80
55,155,Oliver,Tuvault,OTUVAULT,011.44.1344.486508,2007-11-23 00:00:00,SA_REP,7000,0.15,145,80
56,156,Janette,King,JKING,011.44.1345.429268,2004-01-30 00:00:00,SA_REP,10000,0.35,146,80
57,157,Patrick,Sully,PSULLY,011.44.1345.929268,2004-03-04 00:00:00,SA_REP,9500,0.35,146,80
58,158,Allan,McEwen,AMCEWEN,011.44.1345.829268,2004-08-01 00:00:00,SA_REP,9000,0.35,146,80
59,159,Lindsey,Smith,LSMITH,011.44.1345.729268,2005-03-10 00:00:00,SA_REP,8000,0.3,146,80


## Can using re library to write format of string and use search() to find within text

In [7]:
f = open('data/last_names.txt')
for line in f:
    regex = 'c.+a'
    if re.search(regex,line):
        print(line, end = " ")
f.close()

Garcia
 Richardson
 Mcdonald
 Richards
 Mcdaniel
 Strickland
 Mccarthy
 Schwartz
 Mclaughlin
 Buchanan
 Richard
 Cochran
 Acosta
 Mcclain
 Hickman
 Mclean
 Mccall
 Blanchard
 Mcmillan
 Ochoa
 Mcfarland
 Macdonald
 Mckay
 Mccarty
 Michael
 Mcgowan
 Macias
 Mccray
 Mcfadden
 Valencia
 Mcmahon
 Rocha
 Schaefer
 Lockhart
 Mccann
 Escobar
 Sinclair
 Mcgrath
 Mccabe
 Pritchard
 Mcclellan
 Mcneal
 Nicholas
 Mccain
 Ackerman
 Carmichael
 Mccauley
 Feliciano
 Mcgraw
 Mcmanus
 Mcwilliams
 Mcrae
 Mcnamara
 Mckenna
 Schafer
 Mcnair
 Michaud
 Mccracken
 Mcnally
 Bouchard
 Blackman
 Schaffer
 Mclain
 Corcoran
 Becerra
 Beckman
 Schumacher
 Merchant
 Mcclelland
 Mcmillian
 Machado
 Michaels
 Mccormack
 Schaeffer
 Mcmahan
 Mccrary
 Schwab
 Schwarz
 

In [8]:
# Finding last_name has 5 chars and start with 'M' and end with 'in'
f = open('data/last_names.txt')
for line in f:
    regex = 'M..in$'
    if re.search(regex,line):
        print(line, end = ' ')

Morin
 Marin
 

In [9]:
#Taking the 3 last numbers of emp_df['PHONE_NUMBER]
pattern = '\.(\d{3})\.'
emp_df['PHONE_NUMBER'].str.extract(pattern)

  pattern = '\.(\d{3})\.'


Unnamed: 0,0
0,123
1,123
2,123
3,423
4,423
...,...
102,123
103,123
104,123
105,123


In [11]:
# Extract a substring from the JOB_ID column of a DataFrame (emp_df) and assigns it to a new column named Job_Code.
emp_df['Job_code'] = emp_df['JOB_ID'].str.extract('(\w+)_')
emp_df.head()

  emp_df['Job_code'] = emp_df['JOB_ID'].str.extract('(\w+)_')


Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID,Job_code
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,0.0,0,90,AD
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,0.0,100,90,AD
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,0.0,100,90,AD
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,0.0,102,60,IT
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,0.0,103,60,IT


In [12]:
# Extract hired year and hired month in the column HIRE_DATE and assign to 2 new columns
emp_df[['HIRED_YEAR', 'HIRED_MONTH']] = emp_df['HIRE_DATE'].str.extract('(\d{4})-(\d{2})')
emp_df.head()

  emp_df[['HIRED_YEAR', 'HIRED_MONTH']] = emp_df['HIRE_DATE'].str.extract('(\d{4})-(\d{2})')


Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID,Job_code,HIRED_YEAR,HIRED_MONTH
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,0.0,0,90,AD,2003,6
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,0.0,100,90,AD,2005,9
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,0.0,100,90,AD,2001,1
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,0.0,102,60,IT,2006,1
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,0.0,103,60,IT,2007,5
