# Basic Text Processing

In this notebook we will look at some basic text and data processing examples. These examples will combine lessons from multiple modules. Please follow along with the videos as I talk through each example.

In [18]:
import os
from datetime import datetime

In [19]:
# Locate the daily bike-sharing CSV relative to the current working directory
# and echo the resolved path so it can be checked before any file is opened.
cwd = os.getcwd()
bike_data_dir = os.path.join(cwd, 'bike-sharing-data')
data_path = os.path.join(bike_data_dir, 'day.csv')
print(data_path)

C:\Users\MGaffney\Documents\LOCALAPPS\PYTHON UCI\MODULE 7\bike-sharing-data\day.csv


## Example 1: Printing the first 10 lines

In [20]:
# Show the first ten lines of the file (the header plus nine data rows).
# Each line keeps its trailing newline, so print() leaves a blank line
# between rows in the output.
with open(data_path) as data_file:
    for lines_shown, line in enumerate(data_file, start=1):
        print(line)
        if lines_shown == 10:
            break

instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt

1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985

2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801

3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349

4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562

5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600

6,2011-01-06,1,0,1,0,4,1,1,0.204348,0.233209,0.518261,0.0895652,88,1518,1606

7,2011-01-07,1,0,1,0,5,1,2,0.196522,0.208839,0.498696,0.168726,148,1362,1510

8,2011-01-08,1,0,1,0,6,0,2,0.165,0.162254,0.535833,0.266804,68,891,959

9,2011-01-09,1,0,1,0,0,0,1,0.138333,0.116175,0.434167,0.36195,54,768,822



## Example 2: Splitting each line into its fields and converting the dates

In [21]:
# Parse the date column (the second comma-separated field) of every data row
# and print it as a datetime object.
with open(data_path) as data_file:
    next(data_file, None)  # discard the header row; it holds no date to parse
    for row in data_file:
        columns = row.strip().split(',')
        parsed_date = datetime.strptime(columns[1], '%Y-%m-%d')
        print(parsed_date)

2011-01-01 00:00:00
2011-01-02 00:00:00
2011-01-03 00:00:00
2011-01-04 00:00:00
2011-01-05 00:00:00
2011-01-06 00:00:00
2011-01-07 00:00:00
2011-01-08 00:00:00
2011-01-09 00:00:00
2011-01-10 00:00:00
2011-01-11 00:00:00
2011-01-12 00:00:00
2011-01-13 00:00:00
2011-01-14 00:00:00
2011-01-15 00:00:00
2011-01-16 00:00:00
2011-01-17 00:00:00
2011-01-18 00:00:00
2011-01-19 00:00:00
2011-01-20 00:00:00
2011-01-21 00:00:00
2011-01-22 00:00:00
2011-01-23 00:00:00
2011-01-24 00:00:00
2011-01-25 00:00:00
2011-01-26 00:00:00
2011-01-27 00:00:00
2011-01-28 00:00:00
2011-01-29 00:00:00
2011-01-30 00:00:00
2011-01-31 00:00:00
2011-02-01 00:00:00
2011-02-02 00:00:00
2011-02-03 00:00:00
2011-02-04 00:00:00
2011-02-05 00:00:00
2011-02-06 00:00:00
2011-02-07 00:00:00
2011-02-08 00:00:00
2011-02-09 00:00:00
2011-02-10 00:00:00
2011-02-11 00:00:00
2011-02-12 00:00:00
2011-02-13 00:00:00
2011-02-14 00:00:00
2011-02-15 00:00:00
2011-02-16 00:00:00
2011-02-17 00:00:00
2011-02-18 00:00:00
2011-02-19 00:00:00


## Example 3: Lines per month

In [22]:
# Tally how many data rows fall in each calendar month. Only the month number
# is used as the key, so rows from different years that share a month are
# counted together.
month_line_counts = {}
with open(data_path) as data_file:
    next(data_file, None)  # the header row is not a data row
    for row in data_file:
        date_text = row.strip().split(',')[1]
        month = datetime.strptime(date_text, '%Y-%m-%d').month
        # .get() supplies 0 the first time a month is seen
        month_line_counts[month] = month_line_counts.get(month, 0) + 1

print(month_line_counts)

{1: 62, 2: 57, 3: 62, 4: 60, 5: 62, 6: 60, 7: 62, 8: 62, 9: 60, 10: 62, 11: 60, 12: 62}


## Example 4: Bike usage per month

In [23]:
# Sum the count in the last column of each data row, grouped by the calendar
# month taken from the row's date (second column).
month_usage_counts = {}
with open(data_path) as data_file:
    next(data_file, None)  # skip the header row; its last field is not a number
    for row in data_file:
        columns = row.strip().split(',')
        rides = int(columns[-1])
        month = datetime.strptime(columns[1], '%Y-%m-%d').month
        # .get() supplies 0 the first time a month is seen
        month_usage_counts[month] = month_usage_counts.get(month, 0) + rides

print(month_usage_counts)

{1: 134933, 2: 151352, 3: 228920, 4: 269094, 5: 331686, 6: 346342, 7: 344948, 8: 351194, 9: 345991, 10: 322352, 11: 254831, 12: 211036}


## Example 5: Creating a file of only days with counts over 6000
Read from the input file and write to the output file within a single `with` statement.

In [24]:
# Copy every data row whose count (last column) is strictly greater than
# count_threshold into a new CSV in the bike-sharing-data directory.
# The header row is skipped and is NOT written to the output file.
count_threshold = 6000
out_path = os.path.join(cwd, 'bike-sharing-data', f'days_over_{count_threshold}.csv')
with open(data_path) as data_file, open(out_path, 'w') as out_file:
    next(data_file, None)  # drop the header so int() only sees numeric counts
    for row in data_file:
        last_field = row.strip().split(',')[-1]
        if int(last_field) > count_threshold:
            out_file.write(row)  # write the unmodified line, newline included

## Example 6: Baby names

In [25]:
baby_name_path = os.path.join(cwd, 'baby-names', 'Most_Popular_Baby_Names__1980-2013.csv')

# Total the number in the last column of each row, keyed by the lower-cased
# name found in the third column. Lower-casing merges spellings that differ
# only by capitalisation.
name_counts = {}
with open(baby_name_path) as name_file:
    next(name_file, None)  # skip the header row
    for row in name_file:
        columns = row.strip().split(',')
        name_key = columns[2].lower()
        # .get() supplies 0 the first time a name is seen
        name_counts[name_key] = name_counts.get(name_key, 0) + int(columns[-1])

print(name_counts)
 

{'michael': 75666, 'jason': 11849, 'christopher': 45714, 'matthew': 53598, 'david': 40415, 'james': 34705, 'robert': 23464, 'daniel': 47889, 'john': 34676, 'joseph': 38754, 'brian': 14179, 'joshua': 39732, 'ryan': 36661, 'nicholas': 36344, 'eric': 15211, 'justin': 18857, 'timothy': 10281, 'william': 27717, 'anthony': 35609, 'andrew': 37576, 'adam': 10980, 'thomas': 9541, 'kevin': 22004, 'steven': 6802, 'jeremy': 943, 'jonathan': 15672, 'brandon': 23050, 'kyle': 14432, 'jospeh': 1443, 'jonathon': 6123, 'jacob': 29856, 'zachary': 13453, 'alexander': 22304, 'tyler': 16979, 'austin': 5704, 'dylan': 2598, 'benjamin': 9672, 'ethan': 10767, 'christian': 1870, 'samuel': 3181, 'zachery': 710, 'nathan': 7568, 'jack': 3552, 'logan': 4616, 'noah': 5281, 'angel': 1081, 'jayden': 3942, 'aiden': 3609, 'jackson': 2413, 'mason': 2566, 'lucas': 1958, 'liam': 1964, 'gabriel': 477, 'elijah': 934, 'henry': 463}
