<a href="https://colab.research.google.com/github/noelmathen/RSMS-CGPA-and-SGPA-Scraper/blob/main/CGPA_and_SGPA_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Extracting CGPA

In [None]:
# Import necessary modules
import requests
from bs4 import BeautifulSoup
import regex as re
import pandas as pd
from google.colab import files

# Upload the Excel file
uploaded = files.upload()

# Stop with a clear message when nothing was uploaded (e.g. the dialog was
# cancelled); otherwise next(iter(...)) below fails with a bare StopIteration
if not uploaded:
    raise ValueError("No file was uploaded. Re-run the cell and choose the Excel sheet.")

# Load the first uploaded file into a DataFrame
df = pd.read_excel(next(iter(uploaded)))

# Fail fast if the sheet lacks a column used later in this cell, rather than
# hitting a KeyError after the network requests have already started
missing_columns = [col for col in ('UID', 'Password', 'Name') if col not in df.columns]
if missing_columns:
    raise ValueError(f"Uploaded sheet is missing required column(s): {missing_columns}")

# Initialize an empty list to store CGPA values (one entry per DataFrame row)
CGPA = []

# Function to extract CGPA for a student from the website
def extract_cgpa(person):
    """Log in to RSMS as one student and record that student's CGPA.

    Exactly one entry is appended to the module-level ``CGPA`` list per call,
    so the list stays aligned with the rows this function is applied to:
    the parsed value as a float, or ``None`` when a request fails, no
    ``CGPA:`` text is found, or the matched text is not a valid number.

    Args:
        person: A row of the uploaded sheet; it must provide the 'UID' and
            'Password' entries used as login credentials.

    Returns:
        None. The result is reported through the ``CGPA`` list.
    """
    login_url = "https://www.rajagiritech.ac.in/stud/ktu/student/varify.asp"
    marks_url = "https://www.rajagiritech.ac.in/stud/ktu/Student/Marks_Rexa.asp"

    # Regular expression to match CGPA value
    pattern = re.compile(r'CGPA:\s*([\d.]+)')
    cgpa_text = None

    try:
        # Use a session so the login cookies carry over to the marks page
        with requests.Session() as s:
            s.get(login_url, timeout=10).raise_for_status()

            # Data payload for the POST request with UID and Password
            payload = {
                "Userid": person['UID'],
                "Password": person['Password']
            }

            # The login POST gets a timeout too; without one a stalled
            # server would block the whole run indefinitely
            s.post(login_url, data=payload, timeout=10).raise_for_status()

            # Now fetch the page with CGPA information
            marks_page = s.get(marks_url, timeout=10)
            marks_page.raise_for_status()

            # Scan every table cell; a later match overwrites an earlier one,
            # so the last "CGPA:" occurrence on the page is the one kept
            soup = BeautifulSoup(marks_page.content, 'lxml')
            for td in soup.find_all('td'):
                match = pattern.search(td.text)
                if match:
                    cgpa_text = match.group(1)
    except requests.exceptions.RequestException as e:
        print(f"Error for {person['UID']}: {e}")

    value = None
    if cgpa_text:
        try:
            value = float(cgpa_text)
        except ValueError:
            # The pattern `[\d.]+` also accepts text such as "." or "8.5.3"
            print(f"Error for {person['UID']}: could not parse CGPA value {cgpa_text!r}")

    # Single append on every path keeps CGPA the same length as the row count
    CGPA.append(value)

# Announce the start of the run
print("Extracting data (CGPA) from RSMS...")

# Call extract_cgpa once per student row; results are collected in the CGPA list
df.apply(extract_cgpa, axis=1)

# Strip the credentials in place, then attach the collected CGPA values
df.drop(columns='Password', inplace=True)
df['CGPA'] = CGPA

# Prepend 1-based roll numbers and keep only the report columns, in this order
roll_numbers = range(1, len(df) + 1)
df.insert(0, 'Roll Number', roll_numbers)
report_columns = ['Roll Number', 'UID', 'Name', 'CGPA']
df = df[report_columns]

# Write the report to an Excel workbook, leaving out the DataFrame index
output_file = 'CGPA_File.xlsx'
df.to_excel(output_file, index=False)

# Offer the workbook to the user as a browser download
files.download(output_file)

# Report completion
print("\nCGPA extraction completed! Check the 'CGPA_File.xlsx' for the results.")


Saving CSBS_UID_Password_List.xlsx to CSBS_UID_Password_List.xlsx
Extracting data (CGPA) from RSMS...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


CGPA extraction completed! Check the 'CGPA_File.xlsx' for the results.


## Extracting SGPA

In [None]:
# Import necessary modules
import requests
from bs4 import BeautifulSoup
import regex as re
import pandas as pd
from google.colab import files

# Upload the Excel file
uploaded = files.upload()

# Stop with a clear message when nothing was uploaded (e.g. the dialog was
# cancelled); otherwise next(iter(...)) below fails with a bare StopIteration
if not uploaded:
    raise ValueError("No file was uploaded. Re-run the cell and choose the Excel sheet.")

# Load the first uploaded file into a DataFrame
df = pd.read_excel(next(iter(uploaded)))

# Fail fast if the sheet lacks a column used later in this cell, rather than
# hitting a KeyError after the network requests have already started
missing_columns = [col for col in ('UID', 'Password', 'Name') if col not in df.columns]
if missing_columns:
    raise ValueError(f"Uploaded sheet is missing required column(s): {missing_columns}")

# Initialize an empty list to store SGPA values (one entry per DataFrame row)
SGPA = []

# Function to extract SGPA for a student from the website
def extract_sgpa(person):
    """Log in to RSMS as one student and record that student's SGPA.

    Exactly one entry is appended to the module-level ``SGPA`` list per call,
    so the list stays aligned with the rows this function is applied to:
    the parsed value as a float, or ``None`` when a request fails, no
    ``SGPA:`` text is found, or the matched text is not a valid number.

    Args:
        person: A row of the uploaded sheet; it must provide the 'UID' and
            'Password' entries used as login credentials.

    Returns:
        None. The result is reported through the ``SGPA`` list.
    """
    login_url = "https://www.rajagiritech.ac.in/stud/ktu/student/varify.asp"
    marks_url = "https://www.rajagiritech.ac.in/stud/ktu/Student/Marks_Rexa.asp"

    # Regular expression to match SGPA value
    pattern = re.compile(r'SGPA:\s*([\d.]+)')
    sgpa_text = None

    try:
        # Use a session so the login cookies carry over to the marks page
        with requests.Session() as s:
            s.get(login_url, timeout=10).raise_for_status()

            # Data payload for the POST request with UID and Password
            payload = {
                "Userid": person['UID'],
                "Password": person['Password']
            }

            # The login POST gets a timeout too; without one a stalled
            # server would block the whole run indefinitely
            s.post(login_url, data=payload, timeout=10).raise_for_status()

            # Now fetch the page with SGPA information
            marks_page = s.get(marks_url, timeout=10)
            marks_page.raise_for_status()

            # Scan every table cell; a later match overwrites an earlier one,
            # so the last "SGPA:" occurrence on the page is the one kept
            soup = BeautifulSoup(marks_page.content, 'lxml')
            for td in soup.find_all('td'):
                match = pattern.search(td.text)
                if match:
                    sgpa_text = match.group(1)
    except requests.exceptions.RequestException as e:
        print(f"Error for {person['UID']}: {e}")

    value = None
    if sgpa_text:
        try:
            value = float(sgpa_text)
        except ValueError:
            # The pattern `[\d.]+` also accepts text such as "." or "8.5.3"
            print(f"Error for {person['UID']}: could not parse SGPA value {sgpa_text!r}")

    # Single append on every path keeps SGPA the same length as the row count
    SGPA.append(value)

# Announce the start of the run
print("Extracting data (SGPA) from RSMS...")

# Call extract_sgpa once per student row; results are collected in the SGPA list
df.apply(extract_sgpa, axis=1)

# Strip the credentials in place, then attach the collected SGPA values
df.drop(columns='Password', inplace=True)
df['SGPA'] = SGPA

# Prepend 1-based roll numbers and keep only the report columns, in this order
roll_numbers = range(1, len(df) + 1)
df.insert(0, 'Roll Number', roll_numbers)
report_columns = ['Roll Number', 'UID', 'Name', 'SGPA']
df = df[report_columns]

# Write the report to an Excel workbook, leaving out the DataFrame index
output_file = 'SGPA_File.xlsx'
df.to_excel(output_file, index=False)

# Offer the workbook to the user as a browser download
files.download(output_file)

# Report completion
print("\nSGPA extraction completed! Check the 'SGPA_File.xlsx' for the results.")


Saving CSBS_UID_Password_List.xlsx to CSBS_UID_Password_List (1).xlsx
Extracting data (SGPA) from RSMS...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


SGPA extraction completed! Check the 'SGPA_File.xlsx' for the results.
