# Python examples from lecture 2

This notebook contains examples of using regular expressions and of loading data from CSV and JSON files, first with the Python standard library and then with functions from the numpy and pandas libraries.

## Regular Expression

In [None]:
import re
import datetime

# Embed the current timestamp in a sentence; the regexes below pull the
# date and the time back out of that text.
now_text = str(datetime.datetime.now())
timestr = 'Current datetime is ' + now_text
print(timestr)

# Date: three digit runs joined by hyphens.
date_pattern = re.compile(r'\d+-\d+-\d+')
today = date_pattern.search(timestr)
print('Today\'s date is ' + today.group())

# Time: the first "digits:digits" pair, i.e. hours and minutes only.
time_pattern = re.compile(r'\d+:\d+')
current_time = time_pattern.search(timestr)
print('Current time is ' + current_time.group())

## Parsing Apache Log File

In [None]:
import re

# One sample Apache access-log entry to take apart.
line = '147.8.212.12 - - [26/Jun/2004:01:42:01 -0500] "GET /~kumar/csci5980/homeworks.html HTTP/1.1" 200 3465 "http://www-users.cs.umn.edu/~kumar/csci5980/list.html" "Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; Win 9x 4.90)"'

# Raw string: in a normal string literal \d, \S and \[ are invalid escape
# sequences and trigger warnings on current Python versions.
# Groups, in order: client address, two identity fields, timestamp, request
# line, status code, response size, referrer, user agent.
regex = r'([\d.]+) (\S+) (\S+) \[(.*?)\] "(.*?)" (\d+) (\S+) "(.*?)" "(.*?)"'

# Match once and read the capture groups. re.split() with a pattern that
# consumes the whole line also returns an empty string before and after the
# real fields.
parsed = re.match(regex, line)
if parsed is None:
    raise ValueError('line is not in the expected Apache log format: ' + line)
fields = list(parsed.groups())
print(fields)

## Loading CSV file with Standard Python Library

In [None]:
# Print the raw file. `!type` exists only in the Windows shell, so the file is
# read from Python instead to keep this cell runnable on any platform.
with open('vehicle.csv') as f:
    print(f.read(), end='')

In [None]:
# Parse the CSV by hand: the first line holds the column names and every
# following line is one record, with fields separated by plain commas.
with open('vehicle.csv', 'r') as f:
    header = f.readline()
    column_names = header.strip().split(',')
    data = [record.strip().split(',') for record in f]

In [None]:
# First five column names taken from the header line.
column_names[:5]

In [None]:
# First five fields of the first data row (still strings; nothing was converted).
data[0][:5]

## Loading CSV File (with CSV)

In [None]:
import csv

# The csv module does its own newline handling, so the file has to be opened
# with newline='' (see the csv module documentation); otherwise line breaks
# embedded in quoted fields can be misread.
with open('vehicle.csv', newline='') as csvfile:
    lines = csv.reader(csvfile)
    # The reader yields one list of strings per row, header row included.
    data = list(lines)

# Row 0 is the header; the actual records start at index 1.
column_names = data[0]
print(column_names)
print(data[1])

## Loading CSV File with Numpy

In [None]:
import numpy as np

# skip_header=1 skips the first (header) line so only data rows are parsed.
data = np.genfromtxt('vehicle.csv',delimiter=',',skip_header=1)

In [None]:
# Check what kind of object np.genfromtxt returned.
type(data)

In [None]:
# A bare name as the last expression makes the notebook display its value.
data

In [None]:
# Size of the array along each axis.
data.shape

In [None]:
data[:2,:2]         # slicing: first two rows, first two columns

In [None]:
data[:2,:2]*10      # scalar multiplication: every element is multiplied by 10

In [None]:
# Element-by-element subtraction of two equally sized slices:
# columns 0-1 minus columns 1-2, both over the first two rows.
data[:2,:2]-data[:2,1:3]

In [None]:
1/data[:2,:2]           # element-by-element reciprocal (not a matrix inverse)

In [None]:
data[:2,:2]**0.5        # element-by-element square root (exponent 0.5)

In [None]:
# Matrix-vector product of the 2x2 slice with a vector of ones,
# which adds up the two entries of each row.
data[:2,:2] @ np.ones(2)

## Loading CSV File with Pandas

In [None]:
import pandas as p
# header=0: take the column names from the first row of the file.
data = p.read_csv('vehicle.csv',header=0)

# You can also use read_table instead of read_csv:
# data = p.read_table('vehicle.csv',sep=',',header='infer')

In [None]:
# Display the DataFrame (the notebook renders the last expression of a cell).
data

In [None]:
# Column labels as an array.
data.columns.values

In [None]:
# Row labels as an array.
data.index.values

In [None]:
# (number of rows, number of columns) of the DataFrame.
data.shape

In [None]:
# Preview the first rows (head() shows 5 by default).
data.head()

In [None]:
# Pick three named columns, then keep only the first three rows.
data[['compactness','circularity','class']].head(3)

In [None]:
# Boolean mask: True for every row whose 'class' value is 'van'.
is_van = data['class']=='van'
# Indexing with the mask keeps only the rows marked True.
vans = data[is_van]
vans.head()

In [None]:
# Number of rows per distinct value of the 'class' column.
counts = data['class'].value_counts()
counts

In [None]:
# Same tally, but ordered by class label (sort_index) rather than in the
# order value_counts returns.
counts2 = data['class'].value_counts().sort_index()
counts2

In [None]:
%matplotlib inline

import matplotlib
# Horizontal bar chart of the per-class counts; rot=0 leaves the tick labels unrotated.
counts.plot(kind='barh',rot=0)

## Loading JSON File

In [None]:
import json

# The file holds one JSON document per line. The `with` block closes the file
# even if a line fails to parse; the explicit encoding avoids depending on the
# platform default; blank lines are skipped because json.loads rejects them.
with open('cdc.json', encoding='utf-8') as f:
    tweets = [json.loads(line) for line in f if line.strip()]

Count the number of times each word appears across all of the tweets.

In [None]:
from collections import defaultdict

counts = defaultdict(int)   # missing words start at 0, so += 1 needs no key check
for tweet in tweets:
    # split() with no argument splits on any run of whitespace and never
    # yields empty strings. split(' ') would count '' for doubled spaces and
    # leave words separated by newlines or tabs glued together.
    words = tweet['text'].split()
    for word in words:
        counts[word] += 1

Sort the terms by frequency in ascending order and take the last 10 entries, i.e. the 10 most popular terms.

In [None]:
import operator
# Materialise the (word, count) pairs and order them by count, ascending.
sorted_words = list(counts.items())
sorted_words.sort(key=operator.itemgetter(1))
# The ten most frequent terms are at the end of the ascending list.
sorted_words[-10:]