In [1]:
import pandas as pd

In [2]:
# reading the csv file
df_exams = pd.read_csv('StudentsPerformance.csv')

In [4]:
# Si queremos mostrar todas las columnas pero solo las filas de la 100 a la 104
df_exams.iloc[100:105, :]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
100,male,group B,some college,standard,none,79,67,67
101,male,group D,bachelor's degree,standard,completed,68,74,74
102,female,group D,associate's degree,standard,none,85,91,89
103,male,group B,high school,standard,completed,60,44,47
104,male,group C,some college,standard,completed,98,86,90


In [5]:
# iloc sirve para indexación basada en posición (por índices numéricos, no por nombres).
# el primer parametro (100:105) selecciona las columnas desde la 100 hasta la 104 (el límite superior es excluyente, por eso se pone 105).
# Y el segundo parametro indica que queremos todas las columnas.

### Adding a new column

Let's say we want to add a new score. For example, a language score

#### 1.1 Adding a new column with a scalar value

In [7]:
# A scalar value is simply a single value. And in this case, all the rows in the column are going to have the same value
df_exams['language score'] = 76
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,76
1,female,group C,some college,standard,completed,69,90,88,76
2,female,group B,master's degree,standard,none,90,95,93,76
3,male,group A,associate's degree,free/reduced,none,47,57,44,76
4,male,group C,some college,standard,none,76,78,75,76
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,76
996,male,group C,high school,free/reduced,none,62,55,55,76
997,female,group C,high school,free/reduced,completed,59,71,65,76
998,female,group D,some college,standard,completed,68,78,77,76


In [8]:
# So we add the language score feature, and all the rows have the same value (this value could be the mediana, for example)
# if we have the score of almost all the students but there are missing values, we could complete it with the mean or the mediana.

#### 1.2 Adding a new column with an array

The dataframe has 1000 rows, so we need to create an array with 1000 elements.

In [9]:
import numpy as np

In [19]:
# creating an array of 1000 elements
a = np.arange(0, 1000)
# this gives us a range of integer numbers that starts from 0 all the way up to< 1000. 
# a = [0, 1, 2, 3, ..., 997, 999, 999]

In [20]:
# length of the array
len(a)

1000

In [37]:
# adding a new column to datafram with an array
df_exams['language score'] = a
df_exams.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
995,female,group E,master's degree,standard,completed,88,99,95,995
996,male,group C,high school,free/reduced,none,62,55,55,996
997,female,group C,high school,free/reduced,completed,59,71,65,997
998,female,group D,some college,standard,completed,68,78,77,998
999,female,group D,some college,free/reduced,none,77,86,86,999


In [23]:
# now language score starts with 0 and ends with 999. but scores are supposed to be between 0 and 100.
# And also it is an increasing sequence.. but usually students have random scores between 0 and 100

In [34]:
# Creating random integer numbers between 1 and 100
lscore = np.random.randint(1, 101, size=1000)
# arguments: lowest value, highest value, size of the array

In [35]:
len(lscore)

1000

In [36]:
# min value inclusive and high value exclusive
min(lscore), max(lscore)

(np.int32(1), np.int32(100))

In [38]:
# adding a new column to a datafram with an array
df_exams['language score'] = lscore
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,92
1,female,group C,some college,standard,completed,69,90,88,2
2,female,group B,master's degree,standard,none,90,95,93,90
3,male,group A,associate's degree,free/reduced,none,47,57,44,100
4,male,group C,some college,standard,none,76,78,75,86


In [39]:
# And as we can see here, we have the same column, but now the scores are random integer numbers from 0 to 100. 
# now this data looks like real scores.

In [40]:
# Creating random float numbers between 1 and 100
np.random.uniform(1, 100, size=10)

array([22.72501797, 96.01311538, 28.20775128, 26.45318244, 91.45029358,
       35.40179736, 27.29628503, 17.48761813, 30.34254469, 38.98441965])

In [None]:
# And those are two ways to add new columns to a dataframe..