In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [37]:
# Output folder for the scraped CSV files: a "data" directory inside the
# directory the notebook is run from
current_directory = os.getcwd()
data_fp = os.path.join(current_directory, "data")

# exist_ok=True makes this cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test
os.makedirs(data_fp, exist_ok=True)

In [31]:
# Scraping approach adapted from:
# https://medium.com/@chipk215/web-scraping-a-story-of-preformatted-text-df65486a8f15

# Address of the preformatted text file to scrape; replace with another URL as needed
url = 'https://www.icess.ucsb.edu/modis/EMIS/images/oakface.prn'

# The last path component of the URL is the file name ...
base_name = os.path.basename(url)

# ... and the file name without its extension is used as the sample type
sampletype, _extension = os.path.splitext(base_name)

print(sampletype)


oakface


In [26]:
# Download the file. The 30-second timeout keeps the cell from hanging
# indefinitely if the server never responds (requests has no default timeout).
page_to_scrape = requests.get(url, timeout=30)

# Stop on an HTTP error (4xx/5xx) instead of parsing an error page as if it were data
page_to_scrape.raise_for_status()

# Load the response body into a BeautifulSoup object
soup = BeautifulSoup(page_to_scrape.text, "html.parser")

# Keep only the text content, with leading/trailing whitespace removed
page_text = soup.text.strip()

In [27]:
# Locate the header marker that begins the data table
table_start_index = page_text.find('#X(Micrometer)')

# Fail loudly when the marker is missing: only printing a message would leave
# table_content undefined (or stale from an earlier run), so the parsing cell
# below would either crash with a NameError or silently parse old data.
if table_start_index == -1:
    raise ValueError("Table starting from '#X(Micrometer)' not found in the text.")

# Everything from the header marker to the end of the text is the table
table_content = page_text[table_start_index:]

print(table_content)

#X(Micrometer)  #X(cm-1)        #Yvalue
14.5588         686.869         .973462
14.4775         690.728         .974855
14.397          694.587         .975169
14.3175         698.445         .974131
14.2388         702.304         .975639
14.161          706.163         .972621
14.0841         710.022         .973456
14.0079         713.881         .973051
13.9326         717.739         .97368
13.8581         721.598         .972855
13.7844         725.457         .972774
13.7115         729.316         .971533
13.6393         733.175         .974574
13.5679         737.033         .97245
13.4972         740.892         .972386
13.4273         744.751         .971772
13.3581         748.61          .973089
13.2896         752.469         .973007
13.2218         756.328         .974443
13.1547         760.186         .973797
13.0882         764.045         .973485
13.0224         767.904         .972467
12.9573         771.763         .973287
12.8929         775.622         .972859
12

In [28]:
# Split the table into lines. splitlines() handles both "\n" and "\r\n" line
# endings, and blank/whitespace-only lines are dropped so they cannot turn
# into empty rows in the DataFrame.
rows = [line for line in table_content.strip().splitlines() if line.strip()]

# The first line is the header; its whitespace-separated tokens are the column names
columns = rows[0].split()

# Every remaining line is one record of whitespace-separated numeric values
data = [[float(value) for value in row.split()] for row in rows[1:]]

# Create a DataFrame
df = pd.DataFrame(data, columns=columns)

print(df)

     #X(Micrometer)  #X(cm-1)   #Yvalue
0          14.55880   686.869  0.973462
1          14.47750   690.728  0.974855
2          14.39700   694.587  0.975169
3          14.31750   698.445  0.974131
4          14.23880   702.304  0.975639
..              ...       ...       ...
595         3.35248  2982.860  0.972688
596         3.34815  2986.720  0.972659
597         3.34383  2990.580  0.973086
598         3.33952  2994.440  0.969277
599         3.33522  2998.300  0.972906

[600 rows x 3 columns]


In [41]:
# Save the parsed table as "<sample type>.csv" inside the data folder,
# leaving the DataFrame index out of the file
output_fp = os.path.join(data_fp, f"{sampletype}.csv")
df.to_csv(output_fp, index=False)