# 07_02: Importing data from text files

In [1]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [2]:
df = pd.read_csv('Planets.csv')
df

Unnamed: 0,Planet,Mass,Diameter,DayLength,SunDistance,OrbitPeriod,OrbitVelocity,MeanTemperature,SurfacePressure,Moons,Rings,MagneticField,FirstVisited,FirstMission
0,MERCURY,0.33,4879,4222.6,57.9,88.0,47.4,167,0.0,0,No,Yes,1974-03-29,Mariner 10
1,VENUS,4.87,12104,2802.0,108.2,224.7,35.0,464,92.0,0,No,No,1962-08-27,Mariner 2
2,EARTH,5.97,12756,24.0,149.6,365.2,29.8,15,1.0,1,No,Yes,,
3,MOON,0.073,3475,708.7,,27.3,1.0,-20,0.0,0,No,No,1959-09-12,Luna 2
4,MARS,0.642,6792,24.7,227.9,687.0,24.1,-65,0.01,2,No,No,1965-07-15,Mariner 4
5,JUPITER,1898.0,142984,9.9,778.6,4331.0,13.1,-110,,67,Yes,Yes,1973-12-04,Pioneer 10
6,SATURN,568.0,120536,10.7,1433.5,10747.0,9.7,-140,,62,Yes,Yes,1979-09-01,Pioneer 11
7,URANUS,86.8,51118,17.2,2872.5,30589.0,6.8,-195,,27,Yes,Yes,1986-01-24,Voyager 2
8,NEPTUNE,102.0,49528,16.1,4495.1,59800.0,5.4,-200,,14,Yes,Yes,1989-08-25,Voyager 2
9,PLUTO,0.0146,2370,153.3,5906.4,90560.0,4.7,-225,1e-05,5,No,,2015-07-14,New Horizons


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Planet           10 non-null     object 
 1   Mass             10 non-null     float64
 2   Diameter         10 non-null     object 
 3   DayLength        10 non-null     float64
 4   SunDistance      9 non-null      float64
 5   OrbitPeriod      10 non-null     object 
 6   OrbitVelocity    10 non-null     float64
 7   MeanTemperature  10 non-null     int64  
 8   SurfacePressure  6 non-null      float64
 9   Moons            10 non-null     int64  
 10  Rings            10 non-null     object 
 11  MagneticField    9 non-null      object 
 12  FirstVisited     9 non-null      object 
 13  FirstMission     9 non-null      object 
dtypes: float64(5), int64(2), object(7)
memory usage: 1.2+ KB


In [4]:
df = pd.read_csv('Planets.csv', thousands=',', parse_dates=['FirstVisited'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Planet           10 non-null     object        
 1   Mass             10 non-null     float64       
 2   Diameter         10 non-null     int64         
 3   DayLength        10 non-null     float64       
 4   SunDistance      9 non-null      float64       
 5   OrbitPeriod      10 non-null     float64       
 6   OrbitVelocity    10 non-null     float64       
 7   MeanTemperature  10 non-null     int64         
 8   SurfacePressure  6 non-null      float64       
 9   Moons            10 non-null     int64         
 10  Rings            10 non-null     object        
 11  MagneticField    9 non-null      object        
 12  FirstVisited     9 non-null      datetime64[ns]
 13  FirstMission     9 non-null      object        
dtypes: datetime64[ns](1), float64(6), int64(3), o

In [5]:
df = pd.read_csv('Planets.csv', thousands=',', parse_dates=['FirstVisited'],
                 dtype={'Diameter': np.float64, 'MeanTemperature': np.float64,
                        'Rings': 'category', 'MagneticField': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Planet           10 non-null     object        
 1   Mass             10 non-null     float64       
 2   Diameter         10 non-null     float64       
 3   DayLength        10 non-null     float64       
 4   SunDistance      9 non-null      float64       
 5   OrbitPeriod      10 non-null     float64       
 6   OrbitVelocity    10 non-null     float64       
 7   MeanTemperature  10 non-null     float64       
 8   SurfacePressure  6 non-null      float64       
 9   Moons            10 non-null     int64         
 10  Rings            10 non-null     category      
 11  MagneticField    9 non-null      category      
 12  FirstVisited     9 non-null      datetime64[ns]
 13  FirstMission     9 non-null      object        
dtypes: category(2), datetime64[ns](1), float64(8)

In [6]:
df.head()

Unnamed: 0,Planet,Mass,Diameter,DayLength,SunDistance,OrbitPeriod,OrbitVelocity,MeanTemperature,SurfacePressure,Moons,Rings,MagneticField,FirstVisited,FirstMission
0,MERCURY,0.33,4879.0,4222.6,57.9,88.0,47.4,167.0,0.0,0,No,Yes,1974-03-29,Mariner 10
1,VENUS,4.87,12104.0,2802.0,108.2,224.7,35.0,464.0,92.0,0,No,No,1962-08-27,Mariner 2
2,EARTH,5.97,12756.0,24.0,149.6,365.2,29.8,15.0,1.0,1,No,Yes,NaT,
3,MOON,0.073,3475.0,708.7,,27.3,1.0,-20.0,0.0,0,No,No,1959-09-12,Luna 2
4,MARS,0.642,6792.0,24.7,227.9,687.0,24.1,-65.0,0.01,2,No,No,1965-07-15,Mariner 4


In [7]:
df.convert_dtypes(dtype_backend='pyarrow').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype                 
---  ------           --------------  -----                 
 0   Planet           10 non-null     string[pyarrow]       
 1   Mass             10 non-null     double[pyarrow]       
 2   Diameter         10 non-null     int64[pyarrow]        
 3   DayLength        10 non-null     double[pyarrow]       
 4   SunDistance      9 non-null      double[pyarrow]       
 5   OrbitPeriod      10 non-null     double[pyarrow]       
 6   OrbitVelocity    10 non-null     double[pyarrow]       
 7   MeanTemperature  10 non-null     int64[pyarrow]        
 8   SurfacePressure  6 non-null      double[pyarrow]       
 9   Moons            10 non-null     int64[pyarrow]        
 10  Rings            10 non-null     category              
 11  MagneticField    9 non-null      category              
 12  FirstVisited     9 non-null      timest

In [8]:
pd.read_csv('Planets-whitespace.csv', sep=r'\s+').head()

Unnamed: 0,Planet,Mass,Diameter,DayLength,SunDistance,OrbitPeriod,OrbitVelocity,MeanTemperature,SurfacePressure,Moons,Rings,MagneticField,FirstVisited,FirstMission
MERCURY,0.33,4879,4222.6,57.9,88.0,47.4,167,0.0,0,No,Yes,1974-03-29,Mariner,10.0
VENUS,4.87,12104,2802.0,108.2,224.7,35.0,464,92.0,0,No,No,1962-08-27,Mariner,2.0
EARTH,5.97,12756,24.0,149.6,365.2,29.8,15,1.0,1,No,Yes,,,
MOON,0.073,3475,708.7,,27.3,1.0,-20,0.0,0,No,No,1959-09-12,Luna,2.0
MARS,0.642,6792,24.7,227.9,687.0,24.1,-65,0.01,2,No,No,1965-07-15,Mariner,4.0


In [9]:
pd.read_csv('Planets-noheader.csv', sep=r'\s+').head()

Unnamed: 0,MERCURY,0.33,4879,4222.6,57.9,88,47.4,167,0.0,0,No,Yes,1974-03-29,Mariner,10
0,VENUS,4.87,12104,2802.0,108.2,224.7,35.0,464,92.0,0,No,No,1962-08-27,Mariner,2.0
1,EARTH,5.97,12756,24.0,149.6,365.2,29.8,15,1.0,1,No,Yes,,,
2,MOON,0.073,3475,708.7,,27.3,1.0,-20,0.0,0,No,No,1959-09-12,Luna,2.0
3,MARS,0.642,6792,24.7,227.9,687.0,24.1,-65,0.01,2,No,No,1965-07-15,Mariner,4.0
4,JUPITER,1898.0,142984,9.9,778.6,4331.0,13.1,-110,,67,Yes,Yes,1973-12-04,Pioneer,10.0


In [10]:
pd.read_csv('Planets-noheader.csv', sep=r'\s+', header=None).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,MERCURY,0.33,4879,4222.6,57.9,88.0,47.4,167,0.0,0,No,Yes,1974-03-29,Mariner,10.0
1,VENUS,4.87,12104,2802.0,108.2,224.7,35.0,464,92.0,0,No,No,1962-08-27,Mariner,2.0
2,EARTH,5.97,12756,24.0,149.6,365.2,29.8,15,1.0,1,No,Yes,,,
3,MOON,0.073,3475,708.7,,27.3,1.0,-20,0.0,0,No,No,1959-09-12,Luna,2.0
4,MARS,0.642,6792,24.7,227.9,687.0,24.1,-65,0.01,2,No,No,1965-07-15,Mariner,4.0


In [11]:
pd.read_csv('Planets-noheader.csv', sep=r'\s+', header=None,
            names=['Planet', 'Mass', 'Diameter', 'DayLength', 'SunDistance', 'OrbitPeriod',
                   'OrbitVelocity', 'MeanTemperature', 'SurfacePressure', 'Moons', 'Rings',
                   'MagneticField', 'FirstVisited', 'FirstMission']).head()

Unnamed: 0,Planet,Mass,Diameter,DayLength,SunDistance,OrbitPeriod,OrbitVelocity,MeanTemperature,SurfacePressure,Moons,Rings,MagneticField,FirstVisited,FirstMission
MERCURY,0.33,4879,4222.6,57.9,88.0,47.4,167,0.0,0,No,Yes,1974-03-29,Mariner,10.0
VENUS,4.87,12104,2802.0,108.2,224.7,35.0,464,92.0,0,No,No,1962-08-27,Mariner,2.0
EARTH,5.97,12756,24.0,149.6,365.2,29.8,15,1.0,1,No,Yes,,,
MOON,0.073,3475,708.7,,27.3,1.0,-20,0.0,0,No,No,1959-09-12,Luna,2.0
MARS,0.642,6792,24.7,227.9,687.0,24.1,-65,0.01,2,No,No,1965-07-15,Mariner,4.0


In [12]:
df.to_csv('myplanets.csv', sep='\t')   # \r encodes tab in a regular Python string
print(open('myplanets.csv').read())

	Planet	Mass	Diameter	DayLength	SunDistance	OrbitPeriod	OrbitVelocity	MeanTemperature	SurfacePressure	Moons	Rings	MagneticField	FirstVisited	FirstMission
0	MERCURY	0.33	4879.0	4222.6	57.9	88.0	47.4	167.0	0.0	0	No	Yes	1974-03-29	Mariner 10
1	VENUS	4.87	12104.0	2802.0	108.2	224.7	35.0	464.0	92.0	0	No	No	1962-08-27	Mariner 2
2	EARTH	5.97	12756.0	24.0	149.6	365.2	29.8	15.0	1.0	1	No	Yes		
3	MOON	0.073	3475.0	708.7		27.3	1.0	-20.0	0.0	0	No	No	1959-09-12	Luna 2
4	MARS	0.642	6792.0	24.7	227.9	687.0	24.1	-65.0	0.01	2	No	No	1965-07-15	Mariner 4
5	JUPITER	1898.0	142984.0	9.9	778.6	4331.0	13.1	-110.0		67	Yes	Yes	1973-12-04	Pioneer 10
6	SATURN	568.0	120536.0	10.7	1433.5	10747.0	9.7	-140.0		62	Yes	Yes	1979-09-01	Pioneer 11
7	URANUS	86.8	51118.0	17.2	2872.5	30589.0	6.8	-195.0		27	Yes	Yes	1986-01-24	Voyager 2
8	NEPTUNE	102.0	49528.0	16.1	4495.1	59800.0	5.4	-200.0		14	Yes	Yes	1989-08-25	Voyager 2
9	PLUTO	0.0146	2370.0	153.3	5906.4	90560.0	4.7	-225.0	1e-05	5	No		2015-07-14	New Horizons



In [13]:
df.to_csv('myplanets.csv', sep='\t', na_rep='NA')
print(open('myplanets.csv').read())

	Planet	Mass	Diameter	DayLength	SunDistance	OrbitPeriod	OrbitVelocity	MeanTemperature	SurfacePressure	Moons	Rings	MagneticField	FirstVisited	FirstMission
0	MERCURY	0.33	4879.0	4222.6	57.9	88.0	47.4	167.0	0.0	0	No	Yes	1974-03-29	Mariner 10
1	VENUS	4.87	12104.0	2802.0	108.2	224.7	35.0	464.0	92.0	0	No	No	1962-08-27	Mariner 2
2	EARTH	5.97	12756.0	24.0	149.6	365.2	29.8	15.0	1.0	1	No	Yes	NA	NA
3	MOON	0.073	3475.0	708.7	NA	27.3	1.0	-20.0	0.0	0	No	No	1959-09-12	Luna 2
4	MARS	0.642	6792.0	24.7	227.9	687.0	24.1	-65.0	0.01	2	No	No	1965-07-15	Mariner 4
5	JUPITER	1898.0	142984.0	9.9	778.6	4331.0	13.1	-110.0	NA	67	Yes	Yes	1973-12-04	Pioneer 10
6	SATURN	568.0	120536.0	10.7	1433.5	10747.0	9.7	-140.0	NA	62	Yes	Yes	1979-09-01	Pioneer 11
7	URANUS	86.8	51118.0	17.2	2872.5	30589.0	6.8	-195.0	NA	27	Yes	Yes	1986-01-24	Voyager 2
8	NEPTUNE	102.0	49528.0	16.1	4495.1	59800.0	5.4	-200.0	NA	14	Yes	Yes	1989-08-25	Voyager 2
9	PLUTO	0.0146	2370.0	153.3	5906.4	90560.0	4.7	-225.0	1e-05	5	No	NA	2015-07-14	New Horizons



In [14]:
df.to_csv('myplanets.csv', sep='\t', na_rep='NA', index=False)
print(open('myplanets.csv').read())

Planet	Mass	Diameter	DayLength	SunDistance	OrbitPeriod	OrbitVelocity	MeanTemperature	SurfacePressure	Moons	Rings	MagneticField	FirstVisited	FirstMission
MERCURY	0.33	4879.0	4222.6	57.9	88.0	47.4	167.0	0.0	0	No	Yes	1974-03-29	Mariner 10
VENUS	4.87	12104.0	2802.0	108.2	224.7	35.0	464.0	92.0	0	No	No	1962-08-27	Mariner 2
EARTH	5.97	12756.0	24.0	149.6	365.2	29.8	15.0	1.0	1	No	Yes	NA	NA
MOON	0.073	3475.0	708.7	NA	27.3	1.0	-20.0	0.0	0	No	No	1959-09-12	Luna 2
MARS	0.642	6792.0	24.7	227.9	687.0	24.1	-65.0	0.01	2	No	No	1965-07-15	Mariner 4
JUPITER	1898.0	142984.0	9.9	778.6	4331.0	13.1	-110.0	NA	67	Yes	Yes	1973-12-04	Pioneer 10
SATURN	568.0	120536.0	10.7	1433.5	10747.0	9.7	-140.0	NA	62	Yes	Yes	1979-09-01	Pioneer 11
URANUS	86.8	51118.0	17.2	2872.5	30589.0	6.8	-195.0	NA	27	Yes	Yes	1986-01-24	Voyager 2
NEPTUNE	102.0	49528.0	16.1	4495.1	59800.0	5.4	-200.0	NA	14	Yes	Yes	1989-08-25	Voyager 2
PLUTO	0.0146	2370.0	153.3	5906.4	90560.0	4.7	-225.0	1e-05	5	No	NA	2015-07-14	New Horizons



In [15]:
df.to_csv('myplanets.csv', sep='\t', na_rep='NA', header=False, index=False)
print(open('myplanets.csv').read())

MERCURY	0.33	4879.0	4222.6	57.9	88.0	47.4	167.0	0.0	0	No	Yes	1974-03-29	Mariner 10
VENUS	4.87	12104.0	2802.0	108.2	224.7	35.0	464.0	92.0	0	No	No	1962-08-27	Mariner 2
EARTH	5.97	12756.0	24.0	149.6	365.2	29.8	15.0	1.0	1	No	Yes	NA	NA
MOON	0.073	3475.0	708.7	NA	27.3	1.0	-20.0	0.0	0	No	No	1959-09-12	Luna 2
MARS	0.642	6792.0	24.7	227.9	687.0	24.1	-65.0	0.01	2	No	No	1965-07-15	Mariner 4
JUPITER	1898.0	142984.0	9.9	778.6	4331.0	13.1	-110.0	NA	67	Yes	Yes	1973-12-04	Pioneer 10
SATURN	568.0	120536.0	10.7	1433.5	10747.0	9.7	-140.0	NA	62	Yes	Yes	1979-09-01	Pioneer 11
URANUS	86.8	51118.0	17.2	2872.5	30589.0	6.8	-195.0	NA	27	Yes	Yes	1986-01-24	Voyager 2
NEPTUNE	102.0	49528.0	16.1	4495.1	59800.0	5.4	-200.0	NA	14	Yes	Yes	1989-08-25	Voyager 2
PLUTO	0.0146	2370.0	153.3	5906.4	90560.0	4.7	-225.0	1e-05	5	No	NA	2015-07-14	New Horizons



In [16]:
df.to_csv('myplanets.csv', sep='\t', na_rep='NA', index=False, header=False, date_format='%d/%m/%Y')
print(open('myplanets.csv').read())

MERCURY	0.33	4879.0	4222.6	57.9	88.0	47.4	167.0	0.0	0	No	Yes	29/03/1974	Mariner 10
VENUS	4.87	12104.0	2802.0	108.2	224.7	35.0	464.0	92.0	0	No	No	27/08/1962	Mariner 2
EARTH	5.97	12756.0	24.0	149.6	365.2	29.8	15.0	1.0	1	No	Yes	NA	NA
MOON	0.073	3475.0	708.7	NA	27.3	1.0	-20.0	0.0	0	No	No	12/09/1959	Luna 2
MARS	0.642	6792.0	24.7	227.9	687.0	24.1	-65.0	0.01	2	No	No	15/07/1965	Mariner 4
JUPITER	1898.0	142984.0	9.9	778.6	4331.0	13.1	-110.0	NA	67	Yes	Yes	04/12/1973	Pioneer 10
SATURN	568.0	120536.0	10.7	1433.5	10747.0	9.7	-140.0	NA	62	Yes	Yes	01/09/1979	Pioneer 11
URANUS	86.8	51118.0	17.2	2872.5	30589.0	6.8	-195.0	NA	27	Yes	Yes	24/01/1986	Voyager 2
NEPTUNE	102.0	49528.0	16.1	4495.1	59800.0	5.4	-200.0	NA	14	Yes	Yes	25/08/1989	Voyager 2
PLUTO	0.0146	2370.0	153.3	5906.4	90560.0	4.7	-225.0	1e-05	5	No	NA	14/07/2015	New Horizons

