# Interactuamos con los ficheros y con el sistema operativo
Trataremos los ficheros, manipularlos, y leer/guardar desde la consola

In [None]:
# Crear una carpeta
!mkdir carpeta

In [None]:
# Vamos a la carpeta y creamos un fichero vacío
# !cd carpeta && touch fichero.py (Linux / Mac)
!cd carpeta && type nul > fichero.py # Windows

In [None]:
# Para interactuar con el sistema operativo 
import os

El módulo **os** que nos permite interactuar con el sistema operativo

In [None]:
nombre_fichero = 'test_file'
# Abrimos el fichero para escritura 
out = open(f'./carpeta/{nombre_fichero}.txt', 'w')

# Escribimos la palabra TheBrigde en este archivo
out.write("TheBridge")

In [None]:
# Debemos cerrar el out
out.close()

In [None]:
# Si queremos guardar el fichero en la carpeta week08
out = open(f'../../week08/{nombre_fichero}.txt', 'w')
out.write("TheBridge")
out.close()

# Abrimos el path o ruta del fichero 

In [None]:
ruta = f'../../week08/{nombre_fichero}.txt'

In [None]:
try:
    with open(ruta, 'r') as entrada:
        pass
except FileNotFoundError as e:
    print(e)

In [None]:
with open(f'carpeta/{nombre_fichero}_2.txt', 'w') as out:
    out.write("TheBridge Data Science" + "\n")

In [None]:
# Probamos a escribir una nueva linea en text file 2
try:
    out.write('otra linea')
except Exception as e:
    print(e)

Con el primer metodo **open** dejamos abierta la conexión hasta su cierre, con el metodo **with** el proceso se cierra de forma automática

El resto de modo para operar con los ficheros: https://docs.python.org/3/library/functions.html#open

In [None]:
# Probamos a escribir una nueva linea en text file 2
try:
    with open(f'carpeta/{nombre_fichero}_2.txt', 'a') as out:
        out.write("otra línea" + "\n")
        out.write('otra línea más' + "\n")
except Exception as e:
    print(e)

In [None]:
# Leer en línea el contenido del fichero
try:
    with open(f'carpeta/{nombre_fichero}_2.txt', 'r') as f:
        contenido = f.read()
except Exception as e:
    print(e)
finally:
    print(contenido)

Lectura de ficheros grande (no exactamente igual que este fichero..)

In [38]:
from sys import getsizeof

In [39]:
big_file = 'carpeta/somehow_big_file.txt'
with open(big_file, 'r') as f:
    content = f.read()
    size_in_bytes = getsizeof(content)
    print(size_in_bytes)

256090


In [42]:
# Más bonito expresado en Kilobytes
big_file = 'carpeta/somehow_big_file.txt'
with open(big_file, 'r') as f:
    content = f.read()
    size_in_bytes = getsizeof(content)
    print(f'el fichero pesa {round(size_in_bytes/ 1024, 3)} KB')

el fichero pesa 250.088 KB


> Crear carpetas

In [53]:
# Utilizamos os.mkdir
new_folder = 'carpeta_ficheros/new_folder'
os.mkdir(new_folder)

> Para poder realizar la creación de subcarpetas, utilizaremos el método iterativo makedirs

In [55]:
try:
    new_subfolder = 'carpeta_ficheros/new_folder/1/2'
    os.makedirs(new_subfolder)
except Exception as e:
    print(e)

In [60]:
!dir carpeta

 El volumen de la unidad C no tiene etiqueta.
 El n�mero de serie del volumen es: A63A-10CF

 Directorio de c:\Users\carlo\Documents\GitHub\The_Bridge_DataScience_PT_ALUMNI_feb22\02_PYTHON\week09\labs\carpeta

31/03/2022  18:25    <DIR>          .
31/03/2022  18:25    <DIR>          ..
31/03/2022  17:34                 0 fichero.py
31/03/2022  18:24           256.041 somehow_big_file.txt
31/03/2022  17:47                 9 test_file.txt
31/03/2022  18:17               303 test_file_2.txt
               4 archivos        256.353 bytes
               2 dirs  187.083.026.432 bytes libres


> En caso de eliminar ficheros

In [61]:
# Utlilizaremos el comando os.remove
nombre_fichero = 'test_file.txt'
ruta = f'carpeta/{nombre_fichero}'
os.remove(ruta)

> En caso de eliminar un directorio vacío

In [62]:
# En este ejemplo eliminaremos solamenta la carpeta 2
ruta_carpeta_vacia = f'carpeta_ficheros/new_folder/1/2'
os.rmdir(ruta_carpeta_vacia)

In [65]:
ruta_carpeta_vacia = f'carpeta_ficheros/new_folder/1'
os.removedirs(ruta_carpeta_vacia)

> en caso de renombrar un fichero

In [66]:
os.rename('carpeta/test_file_2.txt', 'carpeta/fichero_prueba_2.txt')

> Funciones auxiliares de paths

In [67]:
# Unimos diferentes partes de un path con join
path = "/home"
full_path = os.path.join(path, '/thebrige/notebooks/', 'filename.py')
print(full_path)

/thebrige/notebooks/filename.py


In [69]:
# Obtener el directorio dado un fichero
os.path.dirname('carpeta/fichero.py')

'carpeta'

In [70]:
# Obtener la carpeta del working directory
os.getcwd()

'c:\\Users\\carlo\\Documents\\GitHub\\The_Bridge_DataScience_PT_ALUMNI_feb22\\02_PYTHON\\week09\\labs'

In [71]:
# Si queremos seprar la extesión del fichero
os.path.splitext('carpeta/fichero.py')

('carpeta/fichero', '.py')

> Listado de directorios

In [72]:
folder_name = 'carpeta/'
with os.scandir(folder_name) as dir_list:
    for entrada in dir_list:
        print(entrada.name)

fichero.py
fichero_prueba_2.txt
somehow_big_file.txt


In [73]:
# mostramos los ficheros de la carpeta
with os.scandir(folder_name) as dir_list:
    for entrada in dir_list:
        if os.path.isfile(entrada.path):
            print(entrada.name)

fichero.py
fichero_prueba_2.txt
somehow_big_file.txt


> Patrones de Unix Shell

In [77]:
# mostramos nuevamente los ficheros de la carpeta
with os.scandir(folder_name) as dir_list:
    for entry in dir_list:
        if entry.is_file() and entry.name.endswith(".txt"):
            print(entry.name)

fichero_prueba_2.txt
somehow_big_file.txt


In [78]:
# Alternativa que emula el escenario cloud
import glob

In [79]:
glob.glob('carpeta/*.py')

['carpeta\\fichero.py']

### Lectura y ficheros con Pandas
Input/Output
https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

In [81]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.4.1-cp39-cp39-win_amd64.whl (10.5 MB)
Collecting pytz>=2020.1
  Downloading pytz-2022.1-py2.py3-none-any.whl (503 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.4.1 pytz-2022.1


In [82]:
import pandas as pd

In [83]:
# Cargamos los datos del fichero
ruta = 'carpeta/data_marvel-wikia-data.csv'
dataset = pd.read_csv(ruta)
dataset.head(10)

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
0,1678,Spider-Man (Peter Parker),\/Spider-Man_(Peter_Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters,4043.0,Aug-62,1962.0
1,7139,Captain America (Steven Rogers),\/Captain_America_(Steven_Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters,3360.0,Mar-41,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",\/Wolverine_(James_%22Logan%22_Howlett),Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3061.0,Oct-74,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",\/Iron_Man_(Anthony_%22Tony%22_Stark),Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2961.0,Mar-63,1963.0
4,2460,Thor (Thor Odinson),\/Thor_(Thor_Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,2258.0,Nov-50,1950.0
5,2458,Benjamin Grimm (Earth-616),\/Benjamin_Grimm_(Earth-616),Public Identity,Good Characters,Blue Eyes,No Hair,Male Characters,,Living Characters,2255.0,Nov-61,1961.0
6,2166,Reed Richards (Earth-616),\/Reed_Richards_(Earth-616),Public Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,2072.0,Nov-61,1961.0
7,1833,Hulk (Robert Bruce Banner),\/Hulk_(Robert_Bruce_Banner),Public Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,2017.0,May-62,1962.0
8,29481,Scott Summers (Earth-616),\/Scott_Summers_(Earth-616),Public Identity,Neutral Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1955.0,Sep-63,1963.0
9,1837,Jonathan Storm (Earth-616),\/Jonathan_Storm_(Earth-616),Public Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,1934.0,Nov-61,1961.0


In [85]:
dataset.sample(5, random_state = 17)

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
11599,548362,Prunella Hawkins (Earth-616),\/Prunella_Hawkins_(Earth-616),Secret Identity,Bad Characters,,Grey Hair,Female Characters,,Living Characters,1.0,Jan-49,1949.0
13172,289020,Monte Devlin (Earth-616),\/Monte_Devlin_(Earth-616),Secret Identity,Bad Characters,,,Male Characters,,Living Characters,1.0,Aug-92,1992.0
5983,706138,Godfried Herter (Earth-616),\/Godfried_Herter_(Earth-616),Secret Identity,Bad Characters,Brown Eyes,Grey Hair,Male Characters,,Living Characters,4.0,Jan-71,1971.0
2762,43764,Flea (Earth-616),\/Flea_(Earth-616),,Neutral Characters,,Black Hair,Male Characters,,Deceased Characters,12.0,Jan-73,1973.0
8706,248767,Erik Gorbo (Earth-616),\/Erik_Gorbo_(Earth-616),,Bad Characters,,,Male Characters,,Deceased Characters,2.0,Mar-71,1971.0


In [86]:
# Si quiero ver las últimas filas
dataset.tail()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
16371,657508,Ru'ach (Earth-616),\/Ru%27ach_(Earth-616),No Dual Identity,Bad Characters,Green Eyes,No Hair,Male Characters,,Living Characters,,,
16372,665474,Thane (Thanos' son) (Earth-616),\/Thane_(Thanos%27_son)_(Earth-616),No Dual Identity,Good Characters,Blue Eyes,Bald,Male Characters,,Living Characters,,,
16373,695217,Tinkerer (Skrull) (Earth-616),\/Tinkerer_(Skrull)_(Earth-616),Secret Identity,Bad Characters,Black Eyes,Bald,Male Characters,,Living Characters,,,
16374,708811,TK421 (Spiderling) (Earth-616),\/TK421_(Spiderling)_(Earth-616),Secret Identity,Neutral Characters,,,Male Characters,,Living Characters,,,
16375,673702,Yologarch (Earth-616),\/Yologarch_(Earth-616),,Bad Characters,,,,,Living Characters,,,


In [88]:
# El shape devuelve el formato del dataset (n filas, m columnas)
dataset.shape

(16376, 13)

In [89]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16376 entries, 0 to 16375
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   page_id           16376 non-null  int64  
 1   name              16376 non-null  object 
 2   urlslug           16376 non-null  object 
 3   ID                12606 non-null  object 
 4   ALIGN             13564 non-null  object 
 5   EYE               6609 non-null   object 
 6   HAIR              12112 non-null  object 
 7   SEX               15522 non-null  object 
 8   GSM               90 non-null     object 
 9   ALIVE             16373 non-null  object 
 10  APPEARANCES       15280 non-null  float64
 11  FIRST APPEARANCE  15561 non-null  object 
 12  Year              15561 non-null  float64
dtypes: float64(2), int64(1), object(10)
memory usage: 1.6+ MB


In [90]:
# Los estadísticos del dataset
dataset.describe()

Unnamed: 0,page_id,APPEARANCES,Year
count,16376.0,15280.0,15561.0
mean,300232.082377,17.033377,1984.951803
std,253460.403399,96.372959,19.663571
min,1025.0,1.0,1939.0
25%,28309.5,1.0,1974.0
50%,282578.0,3.0,1990.0
75%,509077.0,8.0,2000.0
max,755278.0,4043.0,2013.0


In [92]:
# Conocer las columnas
dataset.columns

Index(['page_id', 'name', 'urlslug', 'ID', 'ALIGN', 'EYE', 'HAIR', 'SEX',
       'GSM', 'ALIVE', 'APPEARANCES', 'FIRST APPEARANCE', 'Year'],
      dtype='object')