In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline
from faker import Faker
# Plotly and Cufflinks set up
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
init_notebook_mode(connected=True)
cf.go_offline()

import warnings
warnings.filterwarnings('ignore')

import os
# Work from the local Downloads folder. A raw string is used so every backslash
# is literal: the previous literal mixed escaped '\\' with a bare '\D', which is
# an invalid escape sequence (warned about by recent Python versions). The
# resulting path is unchanged.
# NOTE(review): this is a hardcoded, machine-specific absolute path; the cell
# raises on any machine where the folder does not exist.
os.chdir(r'C:\Users\mkorzec\Downloads')

from datetime import datetime

In [2]:
# Create the shared Faker generator. The create_* helper functions in the
# following cells read this module-level name to produce names, countries,
# dates, jobs, card providers, locations and race labels.
# NOTE(review): no seed is set in the visible cells, so generated values
# differ on every run.
fake_data = Faker()

In [3]:
def create_names(n):
    """Return a list of ``n`` values from ``fake_data.name()``.

    ``n`` is the number of items to generate; ``fake_data.name()`` is called
    once per item and results are kept in generation order.
    """
    return [fake_data.name() for _ in range(n)]

In [4]:
# 200 fake names; 200 is the row count used for every column of the dataset
# assembled later (this list becomes the 'Name' column).
name = create_names(200)

In [5]:
def create_country(n):
    """Return a list of ``n`` values from ``fake_data.country()``.

    One call to ``fake_data.country()`` is made per requested item; the list
    preserves generation order.
    """
    countries = [fake_data.country() for _ in range(n)]
    return countries

In [6]:
# 200 fake country names; stored later as the 'Place_of_Birth' column.
country = create_country(200)

In [7]:
# 200 random integer ages. randint's upper bound is exclusive, so values fall
# in 21..98 inclusive.
age = np.random.randint(21, 99, size = 200, dtype = 'int')

In [8]:
def create_date(n):
    """Return a list of ``n`` values from ``fake_data.date_this_century()``.

    The generator is invoked once per item and the results are returned in
    the order they were produced.
    """
    return [fake_data.date_this_century() for _ in range(n)]

In [9]:
# 200 fake dates; stored later as the 'Member_Since' column.
date = create_date(200)

In [10]:
def create_job(n):
    """Return a list of ``n`` values from ``fake_data.job()``.

    ``fake_data.job()`` is called once for each requested item; order of
    generation is preserved.
    """
    jobs = [fake_data.job() for _ in range(n)]
    return jobs

In [11]:
# 200 fake job titles; stored later as the 'Job' column.
occupation = create_job(200)

In [12]:
def create_credit(n):
    """Return a list of ``n`` values from ``fake_data.credit_card_provider()``.

    One provider string is requested per item and appended in generation
    order.
    """
    return [fake_data.credit_card_provider() for _ in range(n)]

In [13]:
# 200 fake credit-card provider labels; stored later as the 'Card' column.
card = create_credit(200)

In [14]:
def create_location(n):
    """Return a list of ``n`` values from ``fake_data.local_latlng``.

    Every call passes ``country_code="US"``; the results are collected in
    generation order.
    """
    places = [fake_data.local_latlng(country_code="US") for _ in range(n)]
    return places

In [15]:
# 200 fake US locations; stored later as the 'Location' column (each entry is
# shown in the data preview as a tuple of latitude, longitude, place, country
# code and timezone).
location = create_location(200)

In [16]:
# 200 random 0/1 flags (randint's upper bound 2 is exclusive).
# NOTE(review): presumably 1 means married and 0 not married - confirm.
married = np.random.randint(0,2, size = 200)

In [17]:
def create_race(n):
    """Return a list of ``n`` labels picked via ``fake_data.random_element``.

    Each pick is made from the fixed choices "White", "Hispanic", "Black"
    and "Asian"; results are returned in the order they were drawn.
    """
    choices = ("White", "Hispanic", "Black", "Asian")
    return [fake_data.random_element(elements=choices) for _ in range(n)]

In [18]:
# 200 randomly chosen race labels; stored later as the 'Race' column.
races = create_race(200)

In [19]:
# 200 random integer salaries in 10000..119999 (upper bound is exclusive).
salary = np.random.randint(10000, 120000, size = 200, dtype = 'int')

In [20]:
# 200 random integer savings amounts in 0..49999 (upper bound is exclusive).
# Drawn independently of salary, so the two columns are unrelated.
savings = np.random.randint(0, 50000, size = 200, dtype = 'int')

In [21]:
# 200 random integer rents in 500..3000 inclusive (3001 is the exclusive
# upper bound).
rent = np.random.randint(500, 3001, size = 200, dtype = 'int')

In [22]:
# Assemble the per-person dataset: the twelve generated sequences are zipped
# row-wise into tuples and given column labels in the same order as the zip
# arguments.
data = pd.DataFrame(
    data=list(zip(
        name, country, age, date, occupation, card,
        location, married, races, salary, savings, rent,
    )),
    columns=[
        'Name', 'Place_of_Birth', 'Age', 'Member_Since', 'Job', 'Card',
        'Location', 'Married', 'Race', 'Salary', 'Savings', 'Rent',
    ],
)

In [71]:
# Create a date oriented dataset: one row per consecutive calendar day
# starting 2010-01-01 (200 days), each with a random integer sales figure and
# a random float profit.
datelist = pd.date_range('2010-01-01', periods = 200)
# randint's upper bound is exclusive, so Sales falls in 100..1099.
sales = np.random.randint(100, 1100, size = 200, dtype = 'int')
# uniform draws floats from [10000, 1000000).
profit = np.random.uniform(10000, 1000000, size = 200)

data2 = pd.DataFrame(list(zip(datelist, sales, profit)),
                   columns = ['Date', 'Sales', 'Profit'])
# Round Profit to two decimals as soon as the frame is built. The notebook's
# separate rounding cell sits after the data2.head() preview in file order, so
# a top-to-bottom run would otherwise preview unrounded values, unlike the
# recorded output. Re-applying the same rounding later is a no-op. The raw
# `profit` array is left unrounded.
data2['Profit'] = data2['Profit'].round(2)

In [75]:
# Preview the first rows of the date-oriented dataset (head() shows 5 rows by
# default).
# NOTE(review): this cell's execution count (75) is later than the Profit
# rounding cell (74) that appears after it, so the recorded output already
# shows rounded values.
data2.head()

Unnamed: 0,Date,Sales,Profit
0,2010-01-01,679,303162.58
1,2010-01-02,924,342725.37
2,2010-01-03,857,713536.51
3,2010-01-04,250,101966.18
4,2010-01-05,461,336921.63


In [74]:
# Round the Profit column to two decimal places, overwriting it in place.
# NOTE(review): this cell was executed (count 74) before the data2.head()
# cell (count 75) that precedes it in the file; a top-to-bottom run would
# preview unrounded values.
data2['Profit'] = data2['Profit'].round(2)

In [76]:
# Print column dtypes, non-null counts and memory usage for the date-oriented
# dataset.
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
Date      200 non-null datetime64[ns]
Sales     200 non-null int64
Profit    200 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 4.8 KB


In [24]:
# Preview the first rows of the per-person dataset (head() shows 5 rows by
# default).
data.head()

Unnamed: 0,Name,Place_of_Birth,Age,Member_Since,Job,Card,Location,Married,Race,Salary,Savings,Rent
0,Theodore Chavez,Honduras,39,2016-09-08,Ranger/warden,JCB 16 digit,"(40.63316, -74.13653, Port Richmond, US, Ameri...",1,Asian,13823,4662,2078
1,Julie Lane,Australia,39,2011-06-06,Senior tax professional/tax inspector,VISA 13 digit,"(40.67705, -73.89125, Cypress Hills, US, Ameri...",0,Black,58710,18369,2825
2,Daniel Dominguez,Portugal,98,2010-04-05,"Administrator, sports",Mastercard,"(42.4251, -71.06616, Malden, US, America/New_Y...",0,Hispanic,12838,1935,607
3,Jorge Castillo,Lebanon,28,2012-08-12,"Pharmacist, community",Mastercard,"(39.43534, -84.20299, Lebanon, US, America/New...",1,Asian,105857,30027,2115
4,Elizabeth Young,Bulgaria,64,2012-01-20,Diagnostic radiographer,VISA 13 digit,"(26.18924, -98.15529, San Juan, US, America/Ch...",1,White,37544,45356,2205


In [25]:
# Print column dtypes, non-null counts and memory usage for the per-person
# dataset. In the recorded output Member_Since and Location are object dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 12 columns):
Name              200 non-null object
Place_of_Birth    200 non-null object
Age               200 non-null int64
Member_Since      200 non-null object
Job               200 non-null object
Card              200 non-null object
Location          200 non-null object
Married           200 non-null int64
Race              200 non-null object
Salary            200 non-null int64
Savings           200 non-null int64
Rent              200 non-null int64
dtypes: int64(5), object(7)
memory usage: 18.8+ KB
