# Pandas Introduction

In [1]:
# load the library
from pathlib import Path

import pandas as pd

In [2]:
# Report which pandas release this notebook is running against.
installed_version = pd.__version__
print(installed_version)

1.5.3


### A Pandas Series is like a column in a table.

### It is a one-dimensional array holding data of any type.

In [3]:
# Build a Series from a plain Python list. No index is supplied here,
# so the values are labeled 0, 1, 2.
a = [1, 7, 2]
myvar = pd.Series(data=a)
print(myvar)

0    1
1    7
2    2
dtype: int64


### If nothing else is specified, the values are labeled with their index number. First value has index 0, second value has index 1 etc.

### This label can be used to access a specified value.

In [4]:
# Look up the value stored under the default integer label 0.
print(myvar.loc[0])

1


### With the index argument, you can name your own labels.

In [5]:
# Same values as before, but with explicit string labels supplied
# through the index argument.
a = [1, 7, 2]
myvar = pd.Series(data=a, index=list("xyz"))
print(myvar)

x    1
y    7
z    2
dtype: int64


### When you have created labels, you can access an item by referring to the label.

In [6]:
# Look up the value stored under the label "y".
print(myvar.loc["y"])

7


### You can also use a key/value object, like a dictionary, when creating a Series.

In [7]:
# A dictionary maps directly onto a Series: the keys become the labels
# and the values become the data.
calories = dict(day1=420, day2=380, day3=390)
myvar = pd.Series(data=calories)
print(myvar)

day1    420
day2    380
day3    390
dtype: int64


### To select only some of the items in the dictionary, use the index argument and specify only the items you want to include in the Series.

In [8]:
calories = dict(day1=420, day2=380, day3=390)

# Passing index together with a dictionary keeps only the listed keys.
myvar = pd.Series(data=calories, index=["day1", "day2"])
print(myvar)

day1    420
day2    380
dtype: int64


### Data sets in Pandas are usually multi-dimensional tables, called DataFrames.

### A Series is like a column; a DataFrame is the whole table.

In [9]:
# Each dictionary key becomes a column name; each list holds that
# column's values.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)
myvar = pd.DataFrame(data=data)
print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


### A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

In [10]:
# Column-oriented input: one key per column, one list of values per key.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# Load the data into a DataFrame object.
df = pd.DataFrame(data=data)
print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


### As you can see from the result above, the DataFrame is like a table with rows and columns.

### Pandas uses the loc attribute to return one or more specified rows.

In [11]:
# Refer to a row by its index label.
# Selecting a single label returns a pandas Series.
row_zero = df.loc[0]
print(row_zero)

calories    420
duration     50
Name: 0, dtype: int64


In [12]:
# Return rows 0 and 1 by passing a list of index labels to loc.
first_two_rows = df.loc[[0, 1]]
print(first_two_rows)

   calories  duration
0       420        50
1       380        40


### With the index argument, you can name your own indexes.

In [13]:
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# Name the rows explicitly through the index argument.
df = pd.DataFrame(data=data, index=["day1", "day2", "day3"])
print(df)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


In [14]:
# Refer to a row by its named index label.
day2_row = df.loc["day2"]
print(day2_row)

calories    380
duration     40
Name: day2, dtype: int64


### If your data sets are stored in a file, Pandas can load them into a DataFrame.

In [15]:
# Load the star data set from a CSV file into a DataFrame.
# The path is built from the current user's home directory instead of a
# hardcoded /Users/<name> prefix, so the notebook is not tied to one account.
df = pd.read_csv(Path.home() / "Documents" / "PATH" / "Stars.csv")

In [16]:
# For a large DataFrame with many rows, printing shows only the first 5 rows
# and the last 5 rows, with a row of dots marking the rows left out.
print(df)

     Temperature              L          R    A_M  Color Spectral_Class  Type
0           3068       0.002400     0.1700  16.12    Red              M     0
1           3042       0.000500     0.1542  16.60    Red              M     0
2           2600       0.000300     0.1020  18.70    Red              M     0
3           2800       0.000200     0.1600  16.65    Red              M     0
4           1939       0.000138     0.1030  20.06    Red              M     0
..           ...            ...        ...    ...    ...            ...   ...
235        38940  374830.000000  1356.0000  -9.93   Blue              O     5
236        30839  834042.000000  1194.0000 -10.63   Blue              O     5
237         8829  537493.000000  1423.0000 -10.73  White              A     5
238         9235  404940.000000  1112.0000 -11.23  White              A     5
239        37882  294903.000000  1783.0000  -7.80   Blue              O     5

[240 rows x 7 columns]


In [17]:
# Tip: to_string() renders the entire DataFrame as text.
full_table = df.to_string()
print(full_table)

     Temperature              L           R     A_M               Color Spectral_Class  Type
0           3068       0.002400     0.17000  16.120                 Red              M     0
1           3042       0.000500     0.15420  16.600                 Red              M     0
2           2600       0.000300     0.10200  18.700                 Red              M     0
3           2800       0.000200     0.16000  16.650                 Red              M     0
4           1939       0.000138     0.10300  20.060                 Red              M     0
5           2840       0.000650     0.11000  16.980                 Red              M     0
6           2637       0.000730     0.12700  17.220                 Red              M     0
7           2600       0.000400     0.09600  17.400                 Red              M     0
8           2650       0.000690     0.11000  17.450                 Red              M     0
9           2700       0.000180     0.13000  16.050                 Re

### The number of rows returned is defined in Pandas option settings.

### You can check your system's maximum rows with the pd.options.display.max_rows statement.

In [18]:
# Show the current limit on how many rows pandas prints in full.
print(pd.get_option("display.max_rows"))

60


### On my system the number is 60, which means that if the DataFrame contains more than 60 rows, the print(df)
### statement will show only the headers and the first and last 5 rows.

### You can change the maximum rows number with the same statement.

In [19]:
# Increase the maximum number of rows so the entire DataFrame is displayed.
pd.set_option("display.max_rows", 9999)

### Big data sets are often stored or extracted as JSON.

### JSON is plain text, but has the format of an object, and is well known in the world of programming, including Pandas.

### In our examples we will be using a JSON file called 'data.json'.

In [20]:
# Load the JSON file into a DataFrame.
# The path is built from the current user's home directory instead of a
# hardcoded /Users/<name> prefix, so the notebook is not tied to one account.
df = pd.read_json(Path.home() / "Documents" / "PATH" / "data.json")

print(df.to_string())

# JSON = Python dictionary:
# JSON objects have the same format as Python dictionaries.

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

In [21]:
# Load a Python dictionary into a DataFrame.
# The outer keys are the column names; the inner keys ("0" to "5") are
# the row labels.
data = {
    "Duration": {"0": 60, "1": 60, "2": 60, "3": 45, "4": 45, "5": 60},
    "Pulse": {"0": 110, "1": 117, "2": 103, "3": 109, "4": 117, "5": 102},
    "Maxpulse": {"0": 130, "1": 145, "2": 135, "3": 175, "4": 148, "5": 127},
    "Calories": {"0": 409, "1": 479, "2": 340, "3": 282, "4": 406, "5": 300},
}

df = pd.DataFrame(data=data)
print(df)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


In [22]:
# One of the most-used methods for getting a quick overview of a DataFrame
# is head(). It returns the headers and a specified number of rows,
# starting from the top.

# Reload the star data set, because df was reassigned by the JSON and
# dictionary examples earlier in the notebook. pandas is already imported in
# the first cell, so it is not imported again here. The path is built from the
# current user's home directory rather than a hardcoded /Users/<name> prefix.
df = pd.read_csv(Path.home() / "Documents" / "PATH" / "Stars.csv")

# Get a quick overview by printing the first 10 rows of the DataFrame.
print(df.head(10))

   Temperature         L       R    A_M Color Spectral_Class  Type
0         3068  0.002400  0.1700  16.12   Red              M     0
1         3042  0.000500  0.1542  16.60   Red              M     0
2         2600  0.000300  0.1020  18.70   Red              M     0
3         2800  0.000200  0.1600  16.65   Red              M     0
4         1939  0.000138  0.1030  20.06   Red              M     0
5         2840  0.000650  0.1100  16.98   Red              M     0
6         2637  0.000730  0.1270  17.22   Red              M     0
7         2600  0.000400  0.0960  17.40   Red              M     0
8         2650  0.000690  0.1100  17.45   Red              M     0
9         2700  0.000180  0.1300  16.05   Red              M     0


In [23]:
# Print the first 5 rows of the DataFrame (5 is also head()'s default).
print(df.head(n=5))

   Temperature         L       R    A_M Color Spectral_Class  Type
0         3068  0.002400  0.1700  16.12   Red              M     0
1         3042  0.000500  0.1542  16.60   Red              M     0
2         2600  0.000300  0.1020  18.70   Red              M     0
3         2800  0.000200  0.1600  16.65   Red              M     0
4         1939  0.000138  0.1030  20.06   Red              M     0


In [24]:
# tail() is the counterpart of head(): it returns the headers and a
# specified number of rows, counted from the bottom of the DataFrame.

# Print the last 5 rows of the DataFrame (5 is also tail()'s default).
print(df.tail(n=5))

     Temperature         L       R    A_M  Color Spectral_Class  Type
235        38940  374830.0  1356.0  -9.93   Blue              O     5
236        30839  834042.0  1194.0 -10.63   Blue              O     5
237         8829  537493.0  1423.0 -10.73  White              A     5
238         9235  404940.0  1112.0 -11.23  White              A     5
239        37882  294903.0  1783.0  -7.80   Blue              O     5


In [25]:
# The DataFrame object has a method called info() that gives you more
# information about the data set.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature     240 non-null    int64  
 1   L               240 non-null    float64
 2   R               240 non-null    float64
 3   A_M             240 non-null    float64
 4   Color           240 non-null    object 
 5   Spectral_Class  240 non-null    object 
 6   Type            240 non-null    int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 13.2+ KB
None
