# Code Snippets

## Handy Code Snippets and Commands

### MAKE FANCY SLIDES

Run in a Jupyter terminal:

<code>jupyter nbconvert filename.ipynb --to slides --SlidesExporter.reveal_theme=league --SlidesExporter.theme=dark</code>

<!-- 
jupyter nbconvert filename.ipynb --to slides --SlidesExporter.reveal_theme=league --SlidesExporter.theme=dark
 -->

## CSV Revisions

NOTE: Change cells from markdown to code if you need to regenerate the datasets.

In [20]:
import pandas as pd
import plotly.express as px
from sklearn import datasets as skldatasets

### Plotting 1 Datasets (Scatter Plots)

In [12]:
# Load the semicolon-delimited age/height source file and give the columns
# presentation-style names.
df = pd.read_csv('datasets/age_height_original.csv', delimiter=';')
df = df.rename(columns={'height': 'Height', 'weight': 'Weight', 'age': 'Age', 'male': 'Gender'})
# Recode the 1/0 flag as text labels. Series.replace builds a fresh column,
# which avoids writing strings into a numeric column through .loc (an
# incompatible-dtype assignment that recent pandas versions warn about).
# Values other than 1 and 0 are left untouched, as before.
df['Gender'] = df['Gender'].replace({1: 'male', 0: 'female'})
df
# Uncomment to regenerate the published dataset (Weight is intentionally not exported).
# df.to_csv('datasets/age_height.csv', sep=',', index=False, columns=['Height', 'Age', 'Gender'])

Unnamed: 0,Height,Weight,Age,Gender
0,151.765,47.825606,63.0,male
1,139.700,36.485807,63.0,female
2,136.525,31.864838,65.0,female
3,156.845,53.041914,41.0,male
4,145.415,41.276872,51.0,female
...,...,...,...,...
539,145.415,31.127751,17.0,male
540,162.560,52.163080,31.0,male
541,156.210,54.062497,21.0,female
542,71.120,8.051258,0.0,male


In [13]:
# Load the GPA / study-hours source file and give the columns presentation-style names.
df = pd.read_csv('datasets/gpa_study_hours_original.csv')
df = df.rename(columns={'gpa': 'GPA', 'study_hours': 'Study Hours'})
# Cap GPA at 4.0. The cap is applied numerically so the column stays a float
# column; assigning the string '4' would leave a mix of str and float values.
df['GPA'] = df['GPA'].clip(upper=4.0)
df
# Uncomment to regenerate the published dataset.
# df.to_csv('datasets/gpa_study_hours.csv', sep=',', index=False, columns=['GPA', 'Study Hours'])

Unnamed: 0,GPA,Study Hours
0,4.0,10.0
1,3.8,25.0
2,3.93,45.0
3,3.4,10.0
4,3.2,4.0
...,...,...
188,3.6,24.0
189,3.7,12.0
190,3.84,15.0
191,3.8,10.0


In [14]:
# Load the tips source file and rename the two columns used for plotting.
df = pd.read_csv('datasets/tips_original.csv')
df = df.rename(columns={'total_bill': 'Total Bill', 'tip': 'Tip'})
# Tip as a percentage of the bill, rounded to one decimal place.
df['Tip Percent'] = df['Tip'] * 100 / df['Total Bill']
df = df.round({'Tip Percent': 1})
df
# Uncomment to regenerate the published dataset. The export uses `df`, the
# frame built above (no `df2` exists in this notebook).
# df.to_csv('datasets/tips.csv', sep=',', index=False, columns=['Total Bill', 'Tip', 'Tip Percent'])

Unnamed: 0,Total Bill,Tip,sex,smoker,day,time,size,Tip Percent
0,16.99,1.01,Female,No,Sun,Dinner,2,5.9
1,10.34,1.66,Male,No,Sun,Dinner,3,16.1
2,21.01,3.50,Male,No,Sun,Dinner,3,16.7
3,23.68,3.31,Male,No,Sun,Dinner,2,14.0
4,24.59,3.61,Female,No,Sun,Dinner,4,14.7
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.4
240,27.18,2.00,Female,Yes,Sat,Dinner,2,7.4
241,22.67,2.00,Male,Yes,Sat,Dinner,2,8.8
242,17.82,1.75,Male,No,Sat,Dinner,2,9.8


### Plotting 2 Datasets (Scatter Plots with Chart Options)

In [15]:
# Fetch the iris sample frame from plotly.express; the bare `df` below renders
# it as the cell output.
df = px.data.iris()
df
# Export is left commented out; uncomment to write every column to datasets/iris.csv.
# df.to_csv('datasets/iris.csv', sep=',', index=False, columns=df.columns)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


### Plotting 3 Datasets (Line Graphs)

In [16]:
# Fetch the stocks sample frame from plotly.express (one row per date, one
# column per ticker); the bare `df` below renders it as the cell output.
df = px.data.stocks()
df
# Export is left commented out; uncomment to write the wide table to
# datasets/stocks_original.csv.
# df.to_csv('datasets/stocks_original.csv', sep=',', index=False, columns=['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT'])

Unnamed: 0,date,GOOG,AAPL,AMZN,FB,NFLX,MSFT
0,2018-01-01,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,2018-01-08,1.018172,1.011943,1.061881,0.959968,1.053526,1.015988
2,2018-01-15,1.032008,1.019771,1.053240,0.970243,1.049860,1.020524
3,2018-01-22,1.066783,0.980057,1.140676,1.016858,1.307681,1.066561
4,2018-01-29,1.008773,0.917143,1.163374,1.018357,1.273537,1.040708
...,...,...,...,...,...,...,...
100,2019-12-02,1.216280,1.546914,1.425061,1.075997,1.463641,1.720717
101,2019-12-09,1.222821,1.572286,1.432660,1.038855,1.421496,1.752239
102,2019-12-16,1.224418,1.596800,1.453455,1.104094,1.604362,1.784896
103,2019-12-23,1.226504,1.656000,1.521226,1.113728,1.567170,1.802472


In [17]:
# Build a shuffled copy of the stocks table.
df = px.data.stocks()
df = df.rename(columns={'date': 'Date'})
# frac=1 returns every row in random order. The fixed random_state makes the
# shuffle reproducible, so regenerating the CSV yields the same file each time.
df = df.sample(frac=1, random_state=0)
df
# Uncomment to regenerate the published dataset.
# df.to_csv('datasets/stocks-unsorted.csv', sep=',', index=False, columns=['Date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT'])

Unnamed: 0,Date,GOOG,AAPL,AMZN,FB,NFLX,MSFT
83,2019-08-05,1.077824,1.148514,1.470605,1.005352,1.471165,1.561515
98,2019-11-18,1.175199,1.495886,1.420278,1.064062,1.478547,1.696224
18,2018-05-07,0.996398,1.077657,1.304091,1.000749,1.554645,1.107835
16,2018-04-23,0.934515,0.927543,1.279447,0.929034,1.484642,1.086518
33,2018-08-20,1.107437,1.235200,1.550181,0.934707,1.708748,1.229164
...,...,...,...,...,...,...,...
24,2018-06-18,1.048311,1.056686,1.395830,1.079690,1.957665,1.138564
27,2018-07-09,1.078559,1.093314,1.475039,1.109553,1.884852,1.195487
54,2019-01-14,0.996398,0.896114,1.379989,0.802997,1.614839,1.221340
60,2019-02-25,1.035165,0.999829,1.360081,0.868504,1.701605,1.275995


In [18]:
# Read the apportionment table (numbers use ',' as the thousands separator),
# then keep only the 'United States' rows and the four population columns in
# a single .loc selection.
df = (
    pd.read_csv('datasets/apportionment_original.csv', thousands=',')
    .loc[
        lambda frame: frame['Name'] == 'United States',
        ['Name', 'Year', 'Resident Population', 'Percent Change in Resident Population'],
    ]
)
df
# Uncomment to regenerate the published dataset.
# df.to_csv('datasets/us_population.csv', sep=',', index=False, columns=['Name', 'Year', 'Resident Population', 'Percent Change in Resident Population'])

Unnamed: 0,Name,Year,Resident Population,Percent Change in Resident Population
56,United States,1910,92228531,21.0
113,United States,1920,106021568,15.0
170,United States,1930,123202660,16.2
227,United States,1940,132165129,7.3
284,United States,1950,151325798,14.5
341,United States,1960,179323175,18.5
398,United States,1970,203211926,13.3
455,United States,1980,226545805,11.5
512,United States,1990,248709873,9.8
569,United States,2000,281421906,13.2


In [19]:
# Read the apportionment table, keep the state-level rows and the three
# columns of interest, fill missing values with 0, and shorten the column
# names -- all as one chained expression.
df = (
    pd.read_csv('datasets/apportionment_original.csv', thousands=',')
    .loc[
        lambda frame: frame['Geography Type'] == 'State',
        ['Name', 'Year', 'Number of Representatives'],
    ]
    .fillna(0)
    .rename(columns={'Name': 'State', 'Number of Representatives': 'Representatives'})
)
df
# Uncomment to regenerate the published dataset.
# df.to_csv('datasets/us_apportionment.csv', sep=',', index=False, columns=['State', 'Year', 'Representatives'])

Unnamed: 0,State,Year,Representatives
0,Alabama,1910,10.0
1,Alaska,1910,0.0
2,Arizona,1910,0.0
3,Arkansas,1910,7.0
4,California,1910,11.0
...,...,...,...
674,Virginia,2020,11.0
675,Washington,2020,10.0
676,West Virginia,2020,2.0
677,Wisconsin,2020,8.0


In [20]:
# Reshape the wide stocks table to long form: one row per (date, ticker)
# pair, with the ticker in 'stock' and its value in 'rel_change'.
df = px.data.stocks().melt(
    id_vars='date',
    value_vars=['GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT'],
    var_name='stock',
    value_name='rel_change',
)
df
# Uncomment to regenerate the published dataset.
# df.to_csv('datasets/stocks.csv', sep=',', index=False, columns=df.columns)

Unnamed: 0,date,stock,rel_change
0,2018-01-01,GOOG,1.000000
1,2018-01-08,GOOG,1.018172
2,2018-01-15,GOOG,1.032008
3,2018-01-22,GOOG,1.066783
4,2018-01-29,GOOG,1.008773
...,...,...,...
625,2019-12-02,MSFT,1.720717
626,2019-12-09,MSFT,1.752239
627,2019-12-16,MSFT,1.784896
628,2019-12-23,MSFT,1.802472


In [33]:
# Load scikit-learn's diabetes dataset as pandas objects and keep only the
# feature table (the 'data' entry of the returned bunch).
df = skldatasets.load_diabetes(as_frame=True)['data']
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [49]:
# Total medal count per nation. Only the numeric 'count' column is summed:
# summing every column would also "add up" the text 'medal' column on recent
# pandas versions, producing concatenated strings in the exported table.
# as_index=False keeps 'nation' as a regular column, so no reset_index is needed.
df = px.data.medals_long().groupby('nation', as_index=False)['count'].sum()
df
# Uncomment to regenerate the published dataset / preview the bar charts.
# df.to_csv('datasets/medals_total.csv', sep=',', index=False, columns=df.columns)
# px.bar(px.data.medals_long(), x='nation', y='count')
# px.bar(df, x='nation', y='count')

### Box Plots Datasets

In [9]:
# Read the Toys_and_Games reviews file; lines=True parses it as one JSON
# object per line.
df_csv = pd.read_json(
    'datasets/Toys_and_Games.json',
    lines=True,
)


Unnamed: 0,_id,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category,class,label
0,{'$oid': '5a13282b741a2384e879a620'},A3C9CSW3TJITGT,5069491,Renee,"[0, 0]",I love these felt nursery rhyme characters and...,4,Charming characters but busy work required,1377561600,"08 27, 2013",Toys_and_Games,1,spam
1,{'$oid': '5a13282b741a2384e879a621'},A31POTIYCKSZ9G,76561046,So CA Teacher,"[0, 0]",I see no directions for its use. Therefore I h...,3,No directions for use...,1404864000,"07 9, 2014",Toys_and_Games,0,not spam
2,{'$oid': '5a13282b741a2384e879a622'},A2GGHHME9B6W4O,131358936,Dalilah G.,"[0, 0]",This is a great tool for any teacher using the...,5,Great CD-ROM,1382400000,"10 22, 2013",Toys_and_Games,1,spam
3,{'$oid': '5a13282b741a2384e879a61f'},AMEVO2LY6VEJA,191639,Nicole Soeder,"[0, 0]","Great product, thank you! Our son loved the pu...",5,Puzzles,1388016000,"12 26, 2013",Toys_and_Games,1,spam
4,{'$oid': '5a13282b741a2384e879a623'},A1FSLDH43ORWZP,133642984,Dayna English,"[0, 0]",Although not as streamlined as the Algebra I m...,5,Algebra II -- presentation materials,1374278400,"07 20, 2013",Toys_and_Games,1,spam


In [11]:
# Rename the review columns and make sure the text label agrees with the
# numeric class flag (1 -> 'spam', 0 -> 'not spam').
df = df_csv.rename(columns={'asin': 'productID', 'overall': 'rating'})
df.loc[df["class"] == 1, "label"] = 'spam'
df.loc[df["class"] == 0, "label"] = 'not spam'
# Draw a balanced sample of 5000 reviews per class. DataFrameGroupBy.sample
# keeps the 'class' column in the result (groupby.apply's handling of the
# grouping column varies between pandas versions), and the fixed random_state
# makes the regenerated dataset reproducible. As before, this raises if a
# class has fewer than 5000 rows.
subset = df.groupby('class').sample(n=5000, random_state=0)
subset = subset.reset_index(drop=True)
# Character length of each review text.
subset['reviewLength'] = subset['reviewText'].str.len()
subset
# Uncomment to regenerate the published dataset.
# subset.to_csv('datasets/amazon_reviews_spam_toys.csv', sep=',', index=False, columns=['reviewerID', 'reviewerName', 'productID', 'rating', 'summary', 'reviewText', 'reviewLength', 'label'])

Unnamed: 0,_id,reviewerID,productID,reviewerName,helpful,reviewText,rating,summary,unixReviewTime,reviewTime,category,class,label
0,{'$oid': '5a13282b741a2384e879a620'},A3C9CSW3TJITGT,5069491,Renee,"[0, 0]",I love these felt nursery rhyme characters and...,4,Charming characters but busy work required,1377561600,"08 27, 2013",Toys_and_Games,1,spam
1,{'$oid': '5a13282b741a2384e879a621'},A31POTIYCKSZ9G,76561046,So CA Teacher,"[0, 0]",I see no directions for its use. Therefore I h...,3,No directions for use...,1404864000,"07 9, 2014",Toys_and_Games,0,not spam
2,{'$oid': '5a13282b741a2384e879a622'},A2GGHHME9B6W4O,131358936,Dalilah G.,"[0, 0]",This is a great tool for any teacher using the...,5,Great CD-ROM,1382400000,"10 22, 2013",Toys_and_Games,1,spam
3,{'$oid': '5a13282b741a2384e879a61f'},AMEVO2LY6VEJA,191639,Nicole Soeder,"[0, 0]","Great product, thank you! Our son loved the pu...",5,Puzzles,1388016000,"12 26, 2013",Toys_and_Games,1,spam
4,{'$oid': '5a13282b741a2384e879a623'},A1FSLDH43ORWZP,133642984,Dayna English,"[0, 0]",Although not as streamlined as the Algebra I m...,5,Algebra II -- presentation materials,1374278400,"07 20, 2013",Toys_and_Games,1,spam


In [43]:
# Load the catalogue and keep only countries that appear on more than three
# rows (counted before the Movie filter below, so TV shows count too).
df = pd.read_csv('datasets/Movies and TV Shows.csv')
df = df.groupby('country').filter(lambda x: len(x) > 3)
# Drop multi-country entries (comma-separated lists) and rows with no country.
df = df[~df['country'].str.contains(',', na=False) & df['country'].notna()]
# Keep movies only. The explicit copy makes the column assignment below write
# to an independent frame instead of a filtered slice (avoids
# SettingWithCopyWarning); bracket access avoids relying on attribute lookup
# for a column named 'type'.
df = df[df['type'] == 'Movie'].copy()
# Strip every non-digit from the duration text (e.g. '113 min' -> '113') and
# convert the remainder to a number.
df['runtime'] = pd.to_numeric(df['duration'].str.replace(r'\D', '', regex=True))
df
# Uncomment to regenerate the published dataset / preview the box plot
# (countries ordered by median runtime).
# df.to_csv('datasets/movies.csv', sep=',', index=False, columns=['title', 'director', 'cast', 'country', 'release_year', 'rating', 'listed_in', 'description', 'runtime'])
# x='country'
# y='runtime'
# category_orders = {x:df.groupby(x)[y].median().sort_values().index.to_list()}
# px.box(df, x=x, y=y, category_orders=category_orders)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,runtime
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...,113
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,110
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...,74
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ...",69
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9604,s9605,Movie,Bride Wars,Gary Winick,"Kate Hudson, Anne Hathaway, Kristen Johnston, ...",United States,,2009,PG,89 min,Comedy,When a clerical error causes a clash in weddin...,89
9614,s9615,Movie,What Happens in Vegas,Tom Vaughan,"Cameron Diaz, Ashton Kutcher, Rob Corddry, ...",United States,,2008,PG-13,99 min,Comedy,Jack Fuller (Ashton Kutcher) and Joy McNally (...,99
9641,s9642,Movie,Marley & Me,David Frankel,"Owen Wilson, Jennifer Aniston",United States,,2008,PG,116 min,"Comedy, Drama, Kids",Jennifer Aniston and Owen Wilson unleash huge ...,116
9643,s9644,Movie,Love & Other Drugs,Edward Zwick,"K K, Jake Gyllenhaal, Anne Hathaway, Oliver Pl...",United States,,2010,R,112 min,"Comedy, Drama",A womanizing pfizer drug rep with no sense of ...,112
