### Partes 1 y 2: 


##### (Part1) Getting Started with Data Analysis - Installation and Loading Data
##### (Part 2) DataFrame and Series Basics

In [None]:

import pandas as pd

# Load the survey responses and the schema file stored next to them.
df = pd.read_csv('../data/survey_results_public.csv')
schema_df = pd.read_csv('../data/survey_results_schema.csv')

df

schema_df

# (number of rows, number of columns)
df.shape

# Column names, non-null counts and dtypes.
df.info()

# Raise the display limits so wide/long frames are truncated later.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 100)
pd.set_option('display.expand_frame_repr', True)

df

df.tail(10)

df.head(5)

# Positional selection: every row from position 1 onward, third column.
df.iloc[1:,2]

# Label selection: the row labelled 10, restricted to four named columns.
df.loc[10, ['CompTotal','CompFreq','Currency','Employment']]



# Partes 3 y 4: 

##### (3) Indexes: How to Set, Reset, and Use Indexes
##### (4) Filtering: Using Conditionals to Filter Rows and Columns

In [None]:
## (3) Indexes: How to Set, Reset, and Use Indexes

import pandas as pd

# Small hand-made table: one person per row.
people = {
    "first": ["Corey", "Jane", "Jhon"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": ["CoreyMSchager@gmail.com", "loco@gamil.com", "lineal@gmail.com"],
}
df = pd.DataFrame(people)

df

df['email']

# Promote the e-mail column to the row index.
df = df.set_index('email')

df

df.index

# With e-mail addresses as the index, .loc looks rows up by address.
df.loc['lineal@gmail.com', 'last']

# Move the e-mail addresses back into a regular column (default integer index again).
df = df.reset_index()

# Working with stackoverflow data 

# index_col makes the named column the row index while the file is being read.
dfs = pd.read_csv('../data/survey_results_public.csv', index_col='ResponseId')
schema_dfs = pd.read_csv('../data/survey_results_schema.csv', index_col='qname')

pd.set_option('display.max_columns',79)
pd.set_option('display.max_rows',73)

dfs.head()

# No dfs.set_index('ResponseId') here: index_col above already turned that
# column into the index, so it no longer exists as a column and set_index
# would raise KeyError.

schema_dfs

# With 'qname' as the index, a question's text can be looked up by column name.
schema_dfs.loc['Employment', 'question']

# Returns a copy sorted by index label; schema_dfs itself is left untouched.
schema_dfs.sort_index()

# ------------------------------END--------------------------------

## (4)Filtering: Using Conditionals to Filter Rows and Columns

df

# Combine conditions with & (and) / | (or); each condition needs its own parentheses.
filt = (df['last'] == 'Doe') & (df['first'] == 'Jane')

df[filt]

# ~ inverts a boolean mask: e-mail addresses of the rows that do NOT match.
df.loc[~filt, 'email']

# Working with Stackoverflow Data

dfs.head()

# Rows whose total compensation is above 200000.
high_salary = (dfs['CompTotal'] > 200000)

dfs.loc[high_salary, ['CompTotal', 'CompFreq', 'Currency', 'Country', 'LanguageHaveWorkedWith']]

# isin() tests membership in a list instead of chaining several == comparisons with |.
countries = ['Uruguay','Argentina','Paraguay']
search = (dfs['Country'].isin(countries))

dfs.loc[search, ['Country' ,'CompTotal', 'Currency', 'CompFreq', 'LanguageHaveWorkedWith']]

pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 600)
pd.set_option('display.min_rows', 600)

# na=False makes missing values count as "does not contain", so the mask is purely boolean.
containFilter = dfs['LanguageHaveWorkedWith'].str.contains('Python', na=False)

containFilter

dfs.loc[containFilter, 'LanguageHaveWorkedWith']

# Partes 5 y 6: 

##### (5): Updating Rows and Columns - Modifying Data Within DataFrames
##### (Part 6) Add or Remove Rows and Columns From DataFrames

In [None]:
# (5): Updating Rows and Columns - Modifying Data Within DataFrames

#### In this Python Programming video, we will be learning how to modify the data within our DataFrames. We will use some of the filtering techniques we learned in the last video to update values conditionally, and we will also be learning how to use the apply, map, and applymap methods. Let's get started...

import pandas as pd

people = {
    "first": ["Corey", "Jane", "Jhon"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": ["CoreyMSchager@gmail.com", "loco@gamil.com", "lineal@gmail.com"],
}

df = pd.DataFrame(people)

df

df.columns

# Replace every column name at once.
df.columns = ['first_name', 'last_name', 'email']

df

df.columns = [x.lower() for x in df.columns]
df

df.columns = df.columns.str.replace(' ', "_")
df

# Rename only the listed columns.
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)

# Overwrite a whole row by label.
df.loc[2] = ['Imanol','Aguer','imanolaguer1@gmail.com']
df

# Overwrite selected columns of one row.
df.loc[2, ['last','email']] = ['Doe','lineal@gmail.com']
df

# Common mistake, kept on purpose as a demonstration: df[filt] is a temporary
# copy, so this chained assignment does NOT change df (pandas warns about it).
filt = (df['email'] == 'lineal@gmail.com')
df[filt]['last'] = 'Smith'

# The correct way: one .loc call with the row mask and the column label.
filt = (df['email'] == 'lineal@gmail.com')
df.loc[filt, 'last'] = 'Pnachito'

df['email'] = df['email'].str.lower()
df

#### Remember these methods
##### 1. apply  2. map  3. applymap  4. replace

## apply method

# On a Series, apply calls the function once per value.
df['email'].apply(len)

def update_email(email):
    """Return the given e-mail address in upper case."""
    return email.upper()

df['email'] = df['email'].apply(update_email)
# df

df['email'] = df['email'].apply(lambda x: x.lower()) # same idea with a lambda function
df

# On a DataFrame, apply calls the function once per column (a Series).
df.apply(len)

df.apply(pd.Series.min)

### applymap method

# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated there, so use whichever name this pandas version provides.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap

elementwise(len) # <---- len() of every individual element of the DataFrame

elementwise(str.upper)

### map method

# Series.map returns a new Series (values missing from the dict become NaN);
# nothing is stored unless the result is assigned back.
df['first'].map({'Corey': 'Lionel','Jane': 'Cristiano'})

# replace leaves values that are not in the dict untouched; here the result is assigned back.
df['first'] = df['first'].replace({'Corey': 'Lionel', 'Jane':'Cristiano'})
df

# Working with Stackoverflow Data 

# Fresh copies of the survey data and its schema, with the default integer index.
dfs = pd.read_csv('../data/survey_results_public.csv')
schema_dfs = pd.read_csv('../data/survey_results_schema.csv')

dfs.head(20)

dfs['Currency']

# Give the currency column a more descriptive name.
dfs = dfs.rename(columns={'Currency': 'LocalCurrency'})

dfs['LocalCurrency']

dfs['SurveyEase']

# Series.map swaps each answer for its label below; answers that are not
# dictionary keys become NaN, which is why .replace is sometimes the better fit.
survey_ease_labels = {
    'Easy': 'Facil',
    'Difficult': 'Dificil',
    'Neither easy nor difficult': 'ni pedos',
}
dfs['SurveyEase'] = dfs['SurveyEase'].map(survey_ease_labels)

dfs.head()

# (Part 6) Add or Remove Rows and Columns From DataFrames 


### This is how we add columns to our DataFrame

df['first'] + " " + df['last']

df["Full_Name"] = df['first'] + " " + df['last'] # <--- creates a new column

df

### This is how we remove columns from our DataFrame

df.drop(columns=['first', 'last'], inplace=True) # <--- deletes the 'first' and 'last' columns
df
# A column can also be deleted with the "del" statement,
# for example: del df['Full_Name']

# expand=True returns the pieces as separate columns instead of one column of lists.
df['Full_Name'].str.split(" ", expand=True)

df[['first','last']] = df['Full_Name'].str.split(" ", expand=True)

df

# df.drop(columns=['Full_Name'])

# DataFrame.append is gone from recent pandas; pd.concat replaces it.
# ignore_index=True gives the new row the next integer label instead of a
# second row labelled 0.
df = pd.concat([df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True) # <-- add a single row

df

## Second DataFrame... people2

people2 = {
    "first": ["George", "Martin"],
    "last": ["Hotz", "Shkreli"],
    "email": ["Hotz@gmail.com", "Shkreli@gamil.com"],
}
df2 = pd.DataFrame(people2)
df2

# Stack the two frames; columns missing from df2 are filled with NaN.
df = pd.concat([df,df2], ignore_index=True, sort=False) # <--- replaces the old append method

df

df.drop(index=(3), inplace=True) # <--- remove a single row by label
df

df.drop(index=[4,5], inplace=True) # <--- remove several rows by label
df

df

df.drop(index=df[df['last'] == 'Doe'].index, inplace=True) # <--- remove rows that match a condition
# The condition can also be stored in a variable first and reused,
# e.g. df.drop(index=df[filt].index)
filt = df['last'] == 'Doe'

df



# Partes 7 y 8: 

##### (Part 7): Sorting Data
##### (Part 8) - Grouping and Aggregating - Analyzing and Exploring Your Data

In [None]:
## (Part 7): Sorting Data

import pandas as pd

# Four people, three of them sharing a last name, so ties can be demonstrated.
people = {
    "first": ["Corey", "Jane", "Jhon", "Adam"],
    "last": ["Schafer", "Doe", "Doe", "Doe"],
    "email": ["CoreyMSchager@gmail.com", "loco@gamil.com", "lineal@gmail.com", "A@email.com"],
}
df = pd.DataFrame(people)
df

# One sort key, descending.
df.sort_values(by='last', ascending=False)

# Two sort keys, both descending.
df.sort_values(by=['last', 'first'], ascending=False)

# One direction per key: last name descending, first name ascending.
# The sorted frame replaces df.
df = df.sort_values(by=['last', 'first'], ascending=[False, True])
df

# Back to index-label order (returns a copy, df keeps the order set above).
df.sort_index()

# A single column (Series) can be sorted on its own.
df['last'].sort_values()

## Working With StackOverflow Data

# Fresh copies of the survey data and its schema, with the default integer index.
dfs = pd.read_csv('../data/survey_results_public.csv')
schema_dfs = pd.read_csv('../data/survey_results_schema.csv')

pd.set_option('display.max_columns', 79)
pd.set_option('display.max_rows', 50)
pd.set_option('display.min_rows', 50)
dfs.head()

# Country A-Z, then currency and total compensation in descending order.
# The sorted frame replaces dfs.
dfs = dfs.sort_values(
    by=['Country', 'Currency', 'CompTotal'],
    ascending=[True, False, False],
)

dfs[['Country', 'Currency', 'CompTotal']].head(150)


# Ten largest values of one column (returns a Series).
dfs['CompTotal'].nlargest(10)

# Ten full rows with the largest / smallest total compensation.
dfs.nlargest(n=10, columns='CompTotal')

dfs.nsmallest(n=10, columns='CompTotal')

## (Part 8) - Grouping and Aggregating - Analyzing and Exploring Your Data

dfs.head()

dfs['CompTotal'].head(15)

dfs['CompTotal'].median()

# Summary statistics for the numeric columns.
dfs.describe()

# count() reports the number of non-missing values.
dfs['CompTotal'].count()

dfs['SOAccount']

# How many respondents gave each answer.
dfs['SOAccount'].value_counts()

dfs['Sexuality']

# Schema row at integer label 47 (presumably the question behind the column above; verify).
schema_dfs.loc[47]

# normalize=True reports fractions instead of raw counts.
dfs['Sexuality'].value_counts(normalize=True)

dfs['Country'].value_counts()

# Split the rows into one group per country.
country_grp = dfs.groupby('Country')

country_grp.get_group('Argentina')

# The same figures for a single country, using a filter instead of the groupby.
filt = dfs['Country'] == 'Argentina'
dfs.loc[filt, 'Sexuality'].value_counts()

country_grp['Sexuality'].value_counts().loc['Argentina']

country_grp['CompTotal'].median().loc['Argentina']

# agg runs several aggregations at once.
country_grp['CompTotal'].agg(['median','mean']).loc['Canada']

# Demonstration of a common error: a SeriesGroupBy has no .str accessor, so
# this raises AttributeError. It is caught and printed so the notebook keeps running.
try:
    country_grp['LanguageHaveWorkedWith'].str.contains('Python').sum()
except AttributeError as err:
    print(f'AttributeError: {err}')

# The working version: apply runs the function on each group's Series,
# where .str is available.
country_grp['LanguageHaveWorkedWith'].apply(
   lambda x: x.str.contains('Python').sum())

country_respondents = dfs['Country'].value_counts()
country_respondents

country_uses_python = country_grp['LanguageHaveWorkedWith'].apply(lambda x: x.str.contains('Python').sum())
country_uses_python

# Put both Series side by side, aligned on the country index.
python_df = pd.concat([country_respondents, country_uses_python], axis='columns', sort=False)
python_df

python_df.rename(columns={'count':'cantidadRespondieron', 'LanguageHaveWorkedWith':'LosQueSabenPyhon'}, inplace=True)

python_df

# Percentage of each country's respondents who mention Python.
python_df['PerKnowPython'] = (python_df['LosQueSabenPyhon'] / python_df['cantidadRespondieron']) * 100

python_df

python_df.sort_values(by='PerKnowPython', ascending=False, inplace=True)

python_df.head(50)

python_df.loc['Argentina']

# --------------- END --------------- 



## Partes 9 y 10

#### (Part 9) - Cleaning Data - Casting Datatypes and Handling Missing Values
#### (Part 10) - Working with Dates and Time Series Data

In [None]:
from datetime import datetime
import pandas as pd
import numpy as np

# Sample data with several kinds of "missing": np.nan, None and the
# placeholder strings 'NA' / 'Missing'.
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

df = pd.DataFrame(people)

# Turn the placeholder strings into real missing values.
df.replace('NA', np.nan, inplace=True)

df.replace('Missing', np.nan, inplace=True)
df

# Default dropna: remove every row that has at least one missing value.
df.dropna()

# Remove a row only when BOTH 'last' and 'email' are missing.
df.dropna(axis='index', how='all', subset=['last', 'email'])

df.isna()

df.fillna(0)

# Demonstration: 'age' still holds strings, so averaging it is not meaningful;
# current pandas raises TypeError. The error is caught and printed so the
# notebook keeps running.
try:
    df['age'].mean()
except TypeError as err:
    print(f'TypeError: {err}')

# Cast to float (not int) because NaN is a float and cannot be stored in an int column.
df['age'] = df['age'].astype(float)
df.dtypes

df['age'].mean()

# Working with StackOverflow Data

def d_parser(x):
    """Parse a timestamp string such as '2020-03-13 08-PM' into a datetime.

    The expected layout is '%Y-%m-%d %I-%p': the date, a space, then the
    12-hour clock hour and the AM/PM marker joined by a dash.
    datetime.strptime raises ValueError when ``x`` does not match.
    """
    layout = '%Y-%m-%d %I-%p'
    return datetime.strptime(x, layout)


dfs = pd.read_csv('../data/survey_results_public.csv', index_col='ResponseId')
dfs_schema = pd.read_csv(
    '../data/survey_results_schema.csv', index_col='qname')

pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

dfs.head()

dfs['YearsCode'].head(10)

# Shows the text answers that get in the way of a numeric cast.
dfs['YearsCode'].unique()

# Swap the two text answers for numbers. The result is assigned back to the
# column: calling replace(..., inplace=True) on dfs['YearsCode'] acts on a
# temporary selection and does not update dfs when Copy-on-Write is active.
dfs['YearsCode'] = dfs['YearsCode'].replace(
    {'Less than 1 year': 0, 'More than 50 years': 51})

dfs['YearsCode'] = dfs['YearsCode'].astype(float)

dfs['YearsCode'].mean()

dfs['YearsCode'].median()

# (Part 10) - Working with Dates and Time Series Data

def d_parser(x):
    """Parse a timestamp string such as '2020-03-13 08-PM' into a datetime.

    The expected layout is '%Y-%m-%d %I-%p': the date, a space, then the
    12-hour clock hour and the AM/PM marker joined by a dash.
    datetime.strptime raises ValueError when ``x`` does not match.
    """
    layout = '%Y-%m-%d %I-%p'
    return datetime.strptime(x, layout)


# Parse the 'Date' column while reading. date_format (pandas >= 2.0) takes the
# strptime layout directly and replaces the deprecated date_parser argument.
dft = pd.read_csv('../data/ETH_1h.csv',
                  parse_dates=['Date'], date_format='%Y-%m-%d %I-%p')

dft.head()

# Alternative: read the file without parse_dates and convert afterwards with
# dft['Date'] = pd.to_datetime(dft['Date'], format='%Y-%m-%d %I-%p')

# A single Timestamp exposes datetime methods directly...
dft.loc[0, 'Date'].day_name()

# ...while a whole Series of datetimes goes through the .dt accessor.
dft['Date'].dt.day_name()

dft['DayOfWeek'] = dft['Date'].dt.day_name()
dft

dft['Date'].min()

dft['Date'].max()

# Subtracting two Timestamps gives a Timedelta.
dft['Date'].max() - dft['Date'].min()

# Rows from the year 2019, selected with an explicit boolean mask.
filt = (dft['Date'] >= pd.to_datetime('2019-01-01')
        ) & (dft['Date'] < pd.to_datetime('2020-01-01'))
dft.loc[filt]

# With the dates as the index, rows can be selected with date strings.
dft.set_index('Date', inplace=True)
dft

dft.loc['2019']

# Row slices by date string go through .loc so they are clearly row selections.
dft.loc['2020-01':'2020-02']

dft.loc['2020-01':'2020-02', 'Close'].mean()

dft.loc['2020-01-01':'2020-01-01', 'High'].max()
# resample('D') groups the rows by calendar day.
highs = dft['High'].resample('D').max()
highs['2020-01-01']

# %matplotlib inline
# highs.plot()

# numeric_only=True skips text columns such as 'DayOfWeek'; without it
# pandas >= 2.0 raises TypeError when it tries to average them.
dft.resample('W').mean(numeric_only=True)
dft

# A different aggregation per column.
dft.resample('W').agg(
    {'Close': 'mean', 'High': 'max', 'Low': 'min', 'Volume': 'sum'})

##  ########### END ########### ##


## Parte 11

#### (Part 11) - Reading and Writing Data to Different Sources - Excel, JSON, SQL, Etc.

In [None]:
from sqlalchemy import create_engine
import mysql.connector
import pandas as pd

df = pd.read_csv('../data/survey_results_public.csv', index_col='ResponseId')
schema_df = pd.read_csv('../data/survey_results_schema.csv', index_col='qname')

pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

df.head()

filt = (df['Country'] == 'India')

india_df = df.loc[filt]
india_df.head()

india_df.to_csv('modified.csv')
india_df.to_csv('modified.tsv', sep='>')

india_df.to_excel('excel_writer.xlsx')

!pip install openpyxl
india_df.to_excel('modified.xlsx')
test = pd.read_excel('modified.xlsx', index_col='ResponseId')
test.head()

india_df.to_json('modified.json', orient='records', lines=True)
test = pd.read_json('modified.json', orient='records', lines=True)
test.head()


!pip install mysql-connector-python
!pip install SQLAlchemy

engine = create_engine('mysql+mysqlconnector://root:root@localhost/usingdata')
connection = engine.connect()
tabla = 'sample'
india_df.to_sql(name=tabla, con=connection, if_exists='append', index=False)
sql_df = pd.read_sql(tabla, con=connection)
sql_df.head()
sql_df = pd.read_sql_query('SELECT * FROM sample', con=connection)
sql_df.head()

posts_df = pd.read_json(
    'https://raw.githubusercontent.com/CoreyMSchafer/code_snippets/master/Python/Flask_Blog/snippets/posts.json')
posts_df.head()

########################### END ########################
# In this Python Programming video, we will be learning how to load and save data using multiple different sources. We will learn how to read/write data to CSV, JSON, Excel, SQL, and more. This covers the vast majority of formats you'll see in the data science field and will be extremely useful to know. Let's get started...

# Video Timestamps:
# Read CSV - 0:56
# Write CSV - 3:20
# Write TSV - 4:40
# Read TSV - 6:00
# Write Excel - 6:15
# Read Excel - 10:42 (Start at 6:15 to see installed packages)
# Write JSON - 12:18
# Read JSON - 15:41
# Write SQL - 16:59
# Read SQL - 24:57 (Start at 16:59 to see installed packages)

# The code for this video can be found at:
# bit.ly/Pandas-11
