# Important Commands and Libraries

## Optimizing Code Performance On Large Datasets

### Profiling

In [None]:
import cProfile
# Usage example: cProfile.run() takes the statement to profile as a string,
# executes it and prints the profiling statistics.
# `calculate_runs` and `teams` are not defined in this cell; they must already
# exist in the notebook namespace when this cell runs.
profile_string = "home_runs = calculate_runs(teams)"
cProfile.run(profile_string)

### Parallel Processing

In [None]:
# concurrent.futures provides pools of worker threads and worker processes.
import concurrent.futures

# Executor.map(fn, *iterables) takes its inputs as positional iterables (it has
# no `args=` keyword) and yields the results in input order.
# `function` and `iterable` are placeholders for your own callable and inputs.
# The `with` block shuts the pool down once all submitted work has finished.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool_thread:
    result = list(pool_thread.map(function, iterable))

with concurrent.futures.ProcessPoolExecutor(max_workers=5) as pool_process:
    result = list(pool_process.map(function, iterable))

# Threading library: run a function in a separate thread of this process
import threading
thread = threading.Thread(target=func, args=(team,)) # create a thread that will call func(team); args must be a tuple
thread.start() # start running the thread
thread.join() # block until the thread has finished
lock = threading.Lock() # create a lock to guard a resource shared between threads
lock.acquire() # take the lock - call inside a function/method before using the shared resource
lock.release() # give the lock back - call after using the shared resource

# Multiprocessing library: run a function in a separate process
import multiprocessing
process = multiprocessing.Process(target=function, args=(email,)) # create a process that will call function(email)
process.start() # start the process
process.join() # block until the process has finished
multiprocessing.Lock() # lock for a resource shared between processes, e.g. stdout or a DB (assign it to a name to use it)

# Inter-process communication using Pipe objects
parent_conn, child_conn = multiprocessing.Pipe() # Pipe() returns the two connection ends
child_conn.send(data) # send data to the other end; child_conn can be passed as an argument of the target function
child_conn.close() # close the connection once everything has been sent
data = parent_conn.recv() # receive the data sent through child_conn

# Creating a pool of processes that takes care of the inter-process communication automatically
from multiprocessing import Pool
P = Pool(2) # pool of 2 workers


In [None]:
# Flushing stdout: force any buffered output to be written out now
import sys

sys.stdout.flush()

In [None]:
# Equivalent of `ls`: returns the names of the entries in the directory `path`
# (`path` is a placeholder for a directory path that you supply).
import os

os.listdir(path)

In [None]:
# a counter library
from collections import Counter

In [None]:
# Regular expressions library
import re
# r"\W+" matches runs of non-word characters (anything other than letters,
# digits and underscore). The raw string keeps the backslash literal and avoids
# Python's "invalid escape sequence" warning for "\W".
# `my_string` is a placeholder for the text to clean.
re.sub(r"\W+", " ", my_string)

In [None]:
# Example Code
import concurrent.futures
from collections import Counter
import os
import re

def word_frequencies(filename):
    """Count how often each word of five or more characters occurs in a file.

    The text is lower-cased and every run of non-word characters is replaced
    by a single space before it is split into words.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each word with at least 5 characters to the number of
        times it occurs in the file.
    """
    with open(filename) as f:
        data = f.read().lower()
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    data = re.sub(r"\W+", " ", data)
    # The length filter also discards the empty strings that split(" ")
    # produces for leading or trailing separators.
    words = [w for w in data.split(" ") if len(w) >= 5]
    return dict(Counter(words))

results = []  # not read anywhere in this cell

# Every entry of the "lines" directory, as a path relative to the working directory.
filenames = ["lines/{}".format(f) for f in os.listdir("lines")]

# The context manager shuts the worker processes down once every file has been
# processed, instead of leaving the pool open.
with concurrent.futures.ProcessPoolExecutor(max_workers=2) as pool:
    # One dict of word counts per file, in the same order as `filenames`.
    word_counts = list(pool.map(word_frequencies, filenames))

# Merge the per-file counts; Counter.update adds the counts of repeated keys.
merged_counts = Counter()
for wc in word_counts:
    merged_counts.update(wc)
total_word_counts = dict(merged_counts)

# The 200 most frequent words overall, as (word, count) pairs.
top_200 = merged_counts.most_common(200)

## Pandas

In [None]:
import pandas as pd
# Initializing a df: 3 rows labelled a, b, c and 3 columns labelled d, e, f
df = pd.DataFrame(data=[[1,2,3] for _ in range(3)], index='a b c'.split(), columns='d e f'.split())
df.shape # (rows, columns)
df.head(rows) # first `rows` rows (`rows` is a placeholder for an integer)
df.tail(rows) # last `rows` rows
df.iloc[1::2,:3] # integer-position based: every second row starting at position 1, first three columns
df.loc['a','d'] # label based: selects the single element at row 'a', column 'd'
df.loc['a'] # selects entire row 'a'
df['e'] # selects entire column 'e'
df = pd.read_csv() # placeholder: read_csv needs the path of the file to read
df.reset_index(inplace = True) # reset_index returns a new frame by default; inplace=True changes df itself instead

# Boolean masks: combine conditions with & / | and wrap each one in parentheses
df[(df['e'] > 1) & (df['d'] < 5)]

# Value counts: how often each distinct value occurs
# (the method name is plural: value_counts)
vc = df.value_counts()
vc.to_dict() # converts the counts to a dictionary

# Aggregates, computed per column
df.max() # .min(), .sum(), .mean()

# Memory usage; the 'deep' option also measures the contents of object columns
df.info(memory_usage = 'deep') # summary of the frame including total memory
df.memory_usage(deep=True) # memory used by each column, in bytes

# Selecting columns by dtype (the method name is plural: select_dtypes)
df.select_dtypes(include=['object', 'float64'])

# Changing the datatype of a column (col_name / dtype_name are placeholders)
df[col_name] = df[col_name].astype(dtype_name)

# Downcasting a column - e.g. from float64 to float32, the smallest dtype
# that downcast='float' will choose
df[col] = pd.to_numeric(df[col], downcast='float') # if column is not float, first use astype to convert to float

# Converting to datetime
df[col] = pd.to_datetime(df[col])

# Converting a column to the category datatype - similar to an enumerated datatype in postgres
df[col] = df[col].astype('category')

# Null values: number of missing values in each column
df.isnull().sum()

# Reading a CSV file
# parse_dates: columns to parse as dates, usecols: columns to keep,
# dtype: datatype per column. With chunksize set, read_csv returns an iterator
# of DataFrames (100 rows each here) instead of a single DataFrame.
eg = pd.read_csv('eg.csv',parse_dates=[cols],usecols=keep_cols,chunksize=100,dtype={col1:'type',col2:'type'})
eg = pd.read_csv('eg.csv',nrows = 5) # read 5 rows only

# Combining dataframes or chunks of dataframes
pd.concat(list_of_dfs)

# Groupby: group the rows by the values of `col`, then aggregate each group
group = df.groupby(col)
group.sum()

# Augmenting with sqlite3 (`conn` is an open database connection, `query` an SQL string)
df.to_sql('table_name',conn, index=False, if_exists= 'append') # append the rows to the table, without writing the index
results_df = pd.read_sql(query,conn) # run the query and load its result into a DataFrame

# Using %%timeit to time code: it is a cell magic, so it must be the very
# first line of its own cell, followed by the code to time, e.g.
#
#     %%timeit
#     <code>

## Sqlite3 and Postgres

In [None]:
def calculate_runs(teams):
    """Return the total home runs of each team, in the order given.

    Runs one parameterised SUM(HR) query against the Batting table per team,
    using the module-level cursor `cur`.

    Args:
        teams: Iterable of teamID values.

    Returns:
        A list with one SUM(HR) result per team, in the same order as `teams`.
    """
    query = "SELECT SUM(HR) FROM Batting WHERE teamID = ?;"
    return [cur.execute(query, [team_id]).fetchone()[0] for team_id in teams]

In [None]:
import sqlite3

# In-memory database that will receive the copy.
memory = sqlite3.connect(':memory:')

# Connection to the database file on disk.
disk = sqlite3.connect('lahman2015.sqlite')

# iterdump() yields the SQL statements of the disk database one by one;
# concatenate them into a single script.
dump = "".join(disk.iterdump())

# Execute the script against the in-memory database to copy the contents across.
memory.executescript(dump)

## Linear Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Seaborn styling example: set the colour palette and the background style
sns.set_palette("GnBu_d")
sns.set_style('whitegrid')

# Explore data (DATAFRAME and the column names are placeholders)
sns.jointplot(x='column_x',y='column_y', data=DATAFRAME,kind='hex') # two columns against each other, drawn as hexagonal bins
sns.pairplot(DATAFRAME) # pairwise plots across the columns of the frame
# Seaborn linear model plot: scatter plot with a fitted regression line
sns.lmplot(x='column_x',y='column_y',data=DATAFRAME)

# TRAINING the data
# List every numerical feature column used for training; the names below are
# placeholders (a literal `...` inside the list is not valid Python).
X = DATAFRAME[['column_x1', 'column_x2', 'column_xn']]
y = DATAFRAME['column_y'] # variable we are trying to predict

# Split the data: 30% is held out for testing; random_state fixes the shuffle
# so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

from sklearn.linear_model import LinearRegression

# Fit the linear model on the training data.
lm = LinearRegression()
lm.fit(X_train,y_train)
print('Coefficients: \n', lm.coef_)
# Predict on the held-out data and plot the predictions against the true values.
prediction = lm.predict(X_test)
plt.scatter(y_test,prediction)
plt.xlabel('Y Tested')
plt.ylabel('Y Predicted')

# EVALUATING the model: mean absolute error, mean squared error and its square root
from sklearn import metrics
print('MAE: ',metrics.mean_absolute_error(y_test,prediction))
print('MSE: ',metrics.mean_squared_error(y_test,prediction))
print('RMSE: ',np.sqrt(metrics.mean_squared_error(y_test,prediction)))

# Visualizing the RESIDUALS (true value minus prediction) as a distribution
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot - confirm against the installed seaborn version.
sns.distplot(y_test-prediction,bins=50)

# Taking a look at the coefficients: one row per feature column of X
pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])




## Logistic Regression

### Cleaning Data

In [None]:
# Visualizing missing values: heatmap of the boolean isnull() mask
# (DataFrame and the column names in this cell are placeholders)
sns.heatmap(DataFrame.isnull(),yticklabels=False,cbar=False,cmap='viridis')
# Countplot: counts per value of Column1, split by Column2
sns.set_style('whitegrid')
sns.countplot(x='Column1',hue='Column2',data=DataFrame,palette='RdBu_r')
# Histogram of one column, with missing values dropped first
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot - confirm against the installed seaborn version.
sns.distplot(DataFrame['Column'].dropna(),kde=False,color='darkred',bins=30)
DataFrame['Column'].hist(bins=30,color='darkred',alpha=0.7) # the same histogram through pandas
# Using Cufflinks to plot a histogram straight from the column
import cufflinks as cf
cf.go_offline()
DataFrame['Column'].iplot(kind='hist',bins=30,color='green')
# Box plot of 'Age' for each 'Pclass'
# NOTE(review): `train` is a different placeholder frame than `DataFrame` used above - confirm which is intended.
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=train,palette='winter')
# Converting categorical features to dummy (indicator) columns; drop_first
# leaves out the first category's column
column1 = pd.get_dummies(DataFrame['column1'],drop_first=True)


### Building Model 

In [None]:
# Train/test split: every column except 'Column_y' is a feature, 'Column_y' is
# the target; 30% of the rows are held out for testing and random_state fixes
# the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(DataFrame.drop('Column_y',axis=1), DataFrame['Column_y'], test_size=0.30, random_state=101)

# Training and predicting: fit on the training rows, predict for the held-out rows
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

# Model evaluation: print the classification report comparing the true labels
# with the predictions
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))