# Read the input
We only need to focus on one table for now.

In [1]:
import pandas as pd
# Load the customers table; the bare filename is resolved relative to the
# current working directory.
df = pd.read_csv('noahs-customers.csv')
# Preview the first two rows to check how the file was parsed.
df.head(2)

Unnamed: 0,customerid,name,address,citystatezip,birthdate,phone,timezone,lat,long
0,1001,Jacqueline Alvarez,105N Elizabeth St,"Manhattan, NY 10013",1958-01-23,315-377-5031,America/New_York,40.71817,-73.997468
1,1002,Julie Howell,185-1 Linden St,"Brooklyn, NY 11221",1956-12-03,680-537-8725,America/New_York,40.69426,-73.92167


# See what headers/labels they have

In [2]:
# The column labels, returned as a pandas Index.
df.columns

Index(['customerid', 'name', 'address', 'citystatezip', 'birthdate', 'phone',
       'timezone', 'lat', 'long'],
      dtype='object')

In [3]:
# Iterating a DataFrame yields its column labels, so the labels can be
# joined directly without an intermediate list.
', '.join(df.columns)

'customerid, name, address, citystatezip, birthdate, phone, timezone, lat, long'

In [4]:
# Print one column label per line.
for label in df.columns:
    print(label)

customerid
name
address
citystatezip
birthdate
phone
timezone
lat
long


# Step 1: Phone - We need phone numbers w/o hyphens

In [5]:
# Pull the 'phone' column out as a Series and preview its first rows.
phones = df['phone']
phones.head()

0    315-377-5031
1    680-537-8725
2    315-846-6054
3    516-275-2292
4    838-830-6960
Name: phone, dtype: object

# Use `pandas.Series.str` for the job

In [6]:
# Exploratory only: split each number on '-' to see its parts.
# The result is displayed but not stored.
phones.str.split('-')

0       [315, 377, 5031]
1       [680, 537, 8725]
2       [315, 846, 6054]
3       [516, 275, 2292]
4       [838, 830, 6960]
              ...       
8255    [914, 511, 7261]
8256    [516, 774, 9355]
8257    [719, 991, 9904]
8258    [516, 361, 8507]
8259    [415, 339, 0869]
Name: phone, Length: 8260, dtype: object

In [7]:
# Remove every hyphen. regex=False makes this a literal replacement no matter
# what the installed pandas version uses as the default for `regex`.
phones = phones.str.replace('-', '', regex=False)

# Add it as a new col in DF

In [8]:
# Store the hyphen-free numbers alongside the originals; assigning to a new
# label appends the column at the far right of the frame.
df['phone2'] = phones
df.head(2)

Unnamed: 0,customerid,name,address,citystatezip,birthdate,phone,timezone,lat,long,phone2
0,1001,Jacqueline Alvarez,105N Elizabeth St,"Manhattan, NY 10013",1958-01-23,315-377-5031,America/New_York,40.71817,-73.997468,3153775031
1,1002,Julie Howell,185-1 Linden St,"Brooklyn, NY 11221",1956-12-03,680-537-8725,America/New_York,40.69426,-73.92167,6805378725


# Place the 2 phone-cols together

- ### First we get the individual index of each phone-col

> `columns.get_loc` is like indexOf \
> `columns[n]` ---> access by index

In [9]:
# Integer position of the 'phone' column.
left_loc = df.columns.get_loc('phone')
# Echo the position and the label found there as a sanity check.
left_loc, df.columns[left_loc]

(5, 'phone')

In [10]:
# Integer position of the 'phone2' column.
right_loc = df.columns.get_loc('phone2')
# Echo the position and the label found there as a sanity check.
right_loc, df.columns[right_loc]

(9, 'phone2')

- ### Then we get a list of column-labels
> --> `.columns` returns an Index object \
 ---> `.tolist` called on the object gives you a list

In [11]:
# Copy the labels into a plain Python list so they can be sliced and
# concatenated in a new order.
Cols = df.columns.tolist()
Cols

['customerid',
 'name',
 'address',
 'citystatezip',
 'birthdate',
 'phone',
 'timezone',
 'lat',
 'long',
 'phone2']

### Finally, manipulate the order of columns
---> Move 'phone2' so that it sits immediately after 'phone' (the other columns keep their relative order)

In [12]:
# Move 'phone2' to sit right after 'phone' by concatenating four slices:
#   1. everything up to and including 'phone'
#   2. 'phone2' itself
#   3. the columns that used to sit between the two
#   4. whatever follows 'phone2'
# Slice 4 is empty here because 'phone2' is the last column, but without it
# any column to the right of 'phone2' would be silently dropped.
# Assumes left_loc < right_loc.
Cols = (
    Cols[:left_loc + 1]
    + Cols[right_loc:right_loc + 1]
    + Cols[left_loc + 1:right_loc]
    + Cols[right_loc + 1:]
)
Cols

['customerid',
 'name',
 'address',
 'citystatezip',
 'birthdate',
 'phone',
 'phone2',
 'timezone',
 'lat',
 'long']

### 👇 Indexing with a list of labels creates a new DataFrame with the columns in that order

In [13]:
# Selecting with a list of labels returns the columns in the list's order;
# rebinding `df` makes the new order the one used from here on.
df = df[Cols]
df.head()

Unnamed: 0,customerid,name,address,citystatezip,birthdate,phone,phone2,timezone,lat,long
0,1001,Jacqueline Alvarez,105N Elizabeth St,"Manhattan, NY 10013",1958-01-23,315-377-5031,3153775031,America/New_York,40.71817,-73.997468
1,1002,Julie Howell,185-1 Linden St,"Brooklyn, NY 11221",1956-12-03,680-537-8725,6805378725,America/New_York,40.69426,-73.92167
2,1003,Christopher Ali,174-28 Baisley Blvd,"Jamaica, NY 11434",2001-09-20,315-846-6054,3158466054,America/New_York,40.68902,-73.77347
3,1004,Christopher Rodriguez,102 Mount Hope Pl,"Bronx, NY 10453",1959-07-10,516-275-2292,5162752292,America/New_York,40.84939,-73.90916
4,1005,Jeffrey Wilkinson,17 St Marks Pl,"Manhattan, NY 10003",1988-09-08,838-830-6960,8388306960,America/New_York,40.72804,-73.98712


# Step 2: Lastnames : to be extracted from full names
---> idea : `name.split()[-1]` will do

### First, we need all names

In [14]:
# Split each full name on whitespace; every row becomes a list of name parts.
# Displayed only, not stored.
df['name'].str.split()

0             [Jacqueline, Alvarez]
1                   [Julie, Howell]
2                [Christopher, Ali]
3          [Christopher, Rodriguez]
4              [Jeffrey, Wilkinson]
                   ...             
8255              [Leslie, Johnson]
8256               [Susan, Gilmore]
8257    [Valerie, Michelle, Bowman]
8258                  [Sandy, Rios]
8259              [Michael, Brooks]
Name: name, Length: 8260, dtype: object

### Now each entry under the label 'name' is a list of strings

> ONE way to pick the last element of each list is `apply` with a lambda

In [15]:
# Take the last element of each row's list of name parts by calling the
# lambda once per row. Displayed only, not stored.
df['name'].str.split().apply(lambda x: x[-1])

0         Alvarez
1          Howell
2             Ali
3       Rodriguez
4       Wilkinson
          ...    
8255      Johnson
8256      Gilmore
8257       Bowman
8258         Rios
8259       Brooks
Name: name, Length: 8260, dtype: object

> BUT we can also simply do `Series.str[-1]`

In [16]:
# `.str[-1]` indexes into each row's list, so this keeps the last
# whitespace-separated token of every name.
# NOTE(review): a trailing suffix (e.g. "Jr." or "III") would be taken as the
# last name here — confirm against the data.
lastname = df['name'].str.split().str[-1]

# Step 3: Last name to phone-keypad digits (2 = abc, 3 = def, …, 9 = wxyz)

In [17]:
# Letter -> keypad digit, expanded once from the keypad groups so that each
# character costs a single dict lookup instead of a scan over all groups.
_KEYPAD_DIGITS = {
    letter: str(digit)
    for letters, digit in (
        ('abc', 2), ('def', 3), ('ghi', 4),
        ('jkl', 5), ('mno', 6), ('pqrs', 7),
        ('tuv', 8), ('wxyz', 9),
    )
    for letter in letters
}


def dostuff(lastname_list):
    """Spell a last name as the digits of a phone keypad.

    Parameters
    ----------
    lastname_list : str
        The last name to convert. Despite the parameter name, this is a
        single string that is walked character by character.

    Returns
    -------
    str
        One digit ('2'-'9') for each letter a-z, matched case-insensitively
        and kept in order. Characters that have no keypad letter (spaces,
        apostrophes, hyphens, digits, accented letters, ...) are skipped, so
        the result can be shorter than the input. An empty input gives ''.
    """
    return ''.join(
        _KEYPAD_DIGITS.get(c.lower(), '')  # '' drops unmapped characters
        for c in lastname_list
    )
# Convert every last name to digits with `dostuff` and append the result as
# a new 'phone3' column.
df['phone3']= lastname.apply( dostuff )
df.head()

Unnamed: 0,customerid,name,address,citystatezip,birthdate,phone,phone2,timezone,lat,long,phone3
0,1001,Jacqueline Alvarez,105N Elizabeth St,"Manhattan, NY 10013",1958-01-23,315-377-5031,3153775031,America/New_York,40.71817,-73.997468,2582739
1,1002,Julie Howell,185-1 Linden St,"Brooklyn, NY 11221",1956-12-03,680-537-8725,6805378725,America/New_York,40.69426,-73.92167,469355
2,1003,Christopher Ali,174-28 Baisley Blvd,"Jamaica, NY 11434",2001-09-20,315-846-6054,3158466054,America/New_York,40.68902,-73.77347,254
3,1004,Christopher Rodriguez,102 Mount Hope Pl,"Bronx, NY 10453",1959-07-10,516-275-2292,5162752292,America/New_York,40.84939,-73.90916,763744839
4,1005,Jeffrey Wilkinson,17 St Marks Pl,"Manhattan, NY 10003",1988-09-08,838-830-6960,8388306960,America/New_York,40.72804,-73.98712,945546766


### Place labels phone/phone2/phone3 together for sanity's sake

In [18]:
# Place 'phone3' immediately after 'phone2'. Positions are looked up from the
# frame's current column order.
L = df.columns.get_loc('phone2')
R = df.columns.get_loc('phone3')
Cols = df.columns.to_list()
# Four slices: up to and including 'phone2', then 'phone3', then the columns
# that sat between them, then everything to the right of 'phone3'.
# The trailing Cols[R + 1:] matters: without it, running this cell a second
# time (when 'phone3' already sits next to 'phone2') would drop 'timezone',
# 'lat' and 'long' from the frame.
Cols = (
    Cols[:L + 1]
    + Cols[R:R + 1]
    + Cols[L + 1:R]
    + Cols[R + 1:]
)
df = df[Cols]
df.head()

Unnamed: 0,customerid,name,address,citystatezip,birthdate,phone,phone2,phone3,timezone,lat,long
0,1001,Jacqueline Alvarez,105N Elizabeth St,"Manhattan, NY 10013",1958-01-23,315-377-5031,3153775031,2582739,America/New_York,40.71817,-73.997468
1,1002,Julie Howell,185-1 Linden St,"Brooklyn, NY 11221",1956-12-03,680-537-8725,6805378725,469355,America/New_York,40.69426,-73.92167
2,1003,Christopher Ali,174-28 Baisley Blvd,"Jamaica, NY 11434",2001-09-20,315-846-6054,3158466054,254,America/New_York,40.68902,-73.77347
3,1004,Christopher Rodriguez,102 Mount Hope Pl,"Bronx, NY 10453",1959-07-10,516-275-2292,5162752292,763744839,America/New_York,40.84939,-73.90916
4,1005,Jeffrey Wilkinson,17 St Marks Pl,"Manhattan, NY 10003",1988-09-08,838-830-6960,8388306960,945546766,America/New_York,40.72804,-73.98712


# Finally, compare

In [19]:
# Keep only the rows whose actual number equals the keypad spelling of the
# customer's last name.
df.loc[df['phone2'].eq(df['phone3'])]

Unnamed: 0,customerid,name,address,citystatezip,birthdate,phone,phone2,phone3,timezone,lat,long
207,1208,Sam Tannenbaum,221 Banker St,"Brooklyn, NY 11222",1994-09-13,826-636-2286,8266362286,8266362286,America/New_York,40.7257,-73.9555
