This function `extract_save` can extract data from the XML file used to backup messages. It extracts:

1. Body of the message in English or Hindi
2. Date stamp
3. Date Sent
4. Service Center Number
5. Human-readable Date & Time
6. Message Title

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import csv

def _extract_sms_data(xml_file_path):
    # Parse the XML file with explicit UTF-8 encoding
    try:
        with open(xml_file_path, encoding='utf-8') as file:
            tree = ET.parse(file)
            root = tree.getroot()
    except FileNotFoundError:
        print(f"Error: File {xml_file_path} not found.")
        return None
    except ET.ParseError:
        print("Error: Invalid XML format.")
        return None
    except UnicodeDecodeError:
        print("Error: Unable to decode file with UTF-8 encoding.")
        return None

    # List to store extracted data
    data = []

    # Define the properties to extract
    properties = ['address', 'body', 'readable_date', 'service_center', 'date', 'date_sent']

    # Iterate through each <sms> element
    for sms in root.findall('sms'):
        # Extract specified attributes, use None if attribute is missing
        sms_data = {prop: sms.get(prop, None) for prop in properties}
        data.append(sms_data)

    # Create DataFrame
    df = pd.DataFrame(data, columns=properties)
    return df

def extract_save(xml_file_path, save_path='sms_data.csv'):
    df = _extract_sms_data(xml_file_path)
    print(df)
    if df is not None:
        # Save DataFrame to CSV with UTF-8 encoding to preserve Hindi text
        df.to_csv(save_path, index=False, encoding='utf-8-sig')
        print(f"\nDataFrame saved to {save_path} with UTF-8 encoding")

In [2]:
# extract_save('sms-20250722210237.xml','one.csv')
# extract_save('sms-20250724144910.xml','two.csv')

In [5]:
import pandas as pd
df = _extract_sms_data('sms-20250722210237.xml')
df.head()

Unnamed: 0,address,body,readable_date,service_center,date,date_sent
0,JM-PAYZAP-S,Login attempt on your PayZapp account at 06:52...,15 May 2025 7:32:18 pm,917021075036,1747317738643,1747317736000
1,51501,971115 Message ID: 0PYGGQiwfmU,15 May 2025 7:48:36 pm,918299901123,1747318716822,1747318715000
2,VM-HDFCBK-S,Sent Rs.420.00\nFrom HDFC Bank A/C *9442\nTo S...,15 May 2025 8:51:33 pm,919823000122,1747322493534,1747322491000
3,AE-AIRTEL-P,फ्री हैलोट्यून आपका इंतज़ार कर रही है!\nआज ही ए...,16 May 2025 12:37:35 pm,919831029607,1747379255850,1747379253000
4,AE-AIRSLF,"नमस्ते, अपना करेंट पैक डिटेल घर बैठे एयरटेल थै...",16 May 2025 2:23:40 pm,919840011991,1747385620009,1747385618000


In [10]:
!pip install langdetect pandas

Collecting langdetect
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993250 sha256=ffbc2be2380c874a94f6d83019df723b805e3436e8ac48bb7115b5cf6236403c
  Stored in directory: c:\users\mhsuh\appdata\local\pip\cache\wheels\c1\67\88\e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Note: you may need to restart the kernel to use updated packages.


  DEPRECATION: Building 'langdetect' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'langdetect'. Discussion can be found at https://github.com/pypa/pip/issues/6334

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [19]:
import re
def clean_message(text):
    text = str(text)
    print('\n',text,'\n')
    text = re.sub(r'\d{4,}', ' [NUM] ', text) # replace long numbers
    # print(text)
    text = re.sub(r'http\S+', ' [URL] ', text) # replace URLs
    # print(text)
    text = re.sub(r'[^\w\s\u0900-\u097F]', '', text) # Remove special char except hindi
    # print(text)
    text = re.sub(r'\s+', ' ', text) # normalize spaces
    print('\n',text,'\n')
    return text.strip()

from langdetect import detect
def detect_language(text):
    try:
        lang = detect(text)
        return lang # ex. 'en', 'hi'
    except:
        return 'unknown'

# print(df['body'].iloc[1])

df['cleaned_message'] = df['body'].apply(clean_message)
df['language'] = df['body'].apply(detect_language)
df.head()

    


 Login attempt on your PayZapp account at 06:52 PM.
If it wasn't you, contact customer support: https://hdfcbk.io/HDFCBK/s/B5LJXVBd 


 Login attempt on your PayZapp account at 0652 PM If it wasnt you contact customer support URL  


 971115 Message ID: 0PYGGQiwfmU 


  NUM Message ID 0PYGGQiwfmU 


 Sent Rs.420.00
From HDFC Bank A/C *9442
To SUNY
On 15/05/25
Ref 513576240499
Not You?
Call 18002586161/SMS BLOCK UPI to 7308080808
 


 Sent Rs42000 From HDFC Bank AC NUM To SUNY On 150525 Ref NUM Not You Call NUM SMS BLOCK UPI to NUM  


 फ्री हैलोट्यून आपका इंतज़ार कर रही है!
आज ही एयरटेल थैंक्स ऐप पर सेट करें| i.airtel.in/Free_hellotune 


 फ्री हैलोट्यून आपका इंतज़ार कर रही है आज ही एयरटेल थैंक्स ऐप पर सेट करें iairtelinFree_hellotune 


 नमस्ते, अपना करेंट पैक डिटेल घर बैठे एयरटेल थैंक्स ऐप पर देखें। https://i.airtel.in/prc लिंक पर क्लिक करें 


 नमस्ते अपना करेंट पैक डिटेल घर बैठे एयरटेल थैंक्स ऐप पर देखें। URL लिंक पर क्लिक करें 


 A/c *9456 Debited for Rs:10.00 on 16-05-2025 18:08:

Unnamed: 0,address,body,readable_date,service_center,date,date_sent,cleaned_message,language
0,JM-PAYZAP-S,Login attempt on your PayZapp account at 06:52...,15 May 2025 7:32:18 pm,917021075036,1747317738643,1747317736000,Login attempt on your PayZapp account at 0652 ...,en
1,51501,971115 Message ID: 0PYGGQiwfmU,15 May 2025 7:48:36 pm,918299901123,1747318716822,1747318715000,NUM Message ID 0PYGGQiwfmU,de
2,VM-HDFCBK-S,Sent Rs.420.00\nFrom HDFC Bank A/C *9442\nTo S...,15 May 2025 8:51:33 pm,919823000122,1747322493534,1747322491000,Sent Rs42000 From HDFC Bank AC NUM To SUNY On ...,en
3,AE-AIRTEL-P,फ्री हैलोट्यून आपका इंतज़ार कर रही है!\nआज ही ए...,16 May 2025 12:37:35 pm,919831029607,1747379255850,1747379253000,फ्री हैलोट्यून आपका इंतज़ार कर रही है आज ही एयर...,hi
4,AE-AIRSLF,"नमस्ते, अपना करेंट पैक डिटेल घर बैठे एयरटेल थै...",16 May 2025 2:23:40 pm,919840011991,1747385620009,1747385618000,नमस्ते अपना करेंट पैक डिटेल घर बैठे एयरटेल थैं...,hi


In [20]:
!pip install nltk indic-nlp-library

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.7.34-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-8.2.3-py3-none-any.whl.metadata (7.0 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Downloading docutils-0.22-py3-none-any.whl.metadata (15 kB)
Collecting sphinxcontrib-applehelp>=1.0.7 (from sphinx>=5.1.0->sphinx-argparse->indic-


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [50]:
print({x: df[df['language'] == x].shape[0] for x in df['language'].unique()})
df[df['language'] == 'id']

{'en': 263, 'de': 5, 'hi': 22, 'so': 1, 'id': 4, 'tl': 1, 'sw': 1}


Unnamed: 0,address,body,readable_date,service_center,date,date_sent,cleaned_message,language,eng_tokens,hi_tokens
45,AE-AIRMCA-S,"Jis vyakti ko aap 10:30 par call kar rahe the,...",20 May 2025 10:35:12 am,919810051688,1747717512070,1747717505000,Jis vyakti ko aap 1030 par call kar rahe the v...,id,,
153,AE-AIRMCA-S,"Jis vyakti ko aap 19:33 par call kar rahe the,...",11 Jun 2025 8:00:57 pm,919845060893,1749652257789,1749652256000,Jis vyakti ko aap 1933 par call kar rahe the v...,id,,
214,AE-AIRMCA-S,"Jis vyakti ko aap 21:15 par call kar rahe the,...",27 Jun 2025 12:38:24 am,919810051688,1750964904184,1750964903000,Jis vyakti ko aap 2115 par call kar rahe the v...,id,,
229,AE-AIRMCA-S,"Jis vyakti ko aap 22:42 par call kar rahe the,...",29 Jun 2025 11:05:36 pm,919810051688,1751218536960,1751218535000,Jis vyakti ko aap 2242 par call kar rahe the v...,id,,


In [24]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
df['eng_tokens'] = df[df['language'] == 'en']['cleaned_message'].apply(word_tokenize)
df.head()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mhsuh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Unnamed: 0,address,body,readable_date,service_center,date,date_sent,cleaned_message,language,eng_tokens
0,JM-PAYZAP-S,Login attempt on your PayZapp account at 06:52...,15 May 2025 7:32:18 pm,917021075036,1747317738643,1747317736000,Login attempt on your PayZapp account at 0652 ...,en,"[Login, attempt, on, your, PayZapp, account, a..."
1,51501,971115 Message ID: 0PYGGQiwfmU,15 May 2025 7:48:36 pm,918299901123,1747318716822,1747318715000,NUM Message ID 0PYGGQiwfmU,de,
2,VM-HDFCBK-S,Sent Rs.420.00\nFrom HDFC Bank A/C *9442\nTo S...,15 May 2025 8:51:33 pm,919823000122,1747322493534,1747322491000,Sent Rs42000 From HDFC Bank AC NUM To SUNY On ...,en,"[Sent, Rs42000, From, HDFC, Bank, AC, NUM, To,..."
3,AE-AIRTEL-P,फ्री हैलोट्यून आपका इंतज़ार कर रही है!\nआज ही ए...,16 May 2025 12:37:35 pm,919831029607,1747379255850,1747379253000,फ्री हैलोट्यून आपका इंतज़ार कर रही है आज ही एयर...,hi,
4,AE-AIRSLF,"नमस्ते, अपना करेंट पैक डिटेल घर बैठे एयरटेल थै...",16 May 2025 2:23:40 pm,919840011991,1747385620009,1747385618000,नमस्ते अपना करेंट पैक डिटेल घर बैठे एयरटेल थैं...,hi,


In [32]:
!pip install indic-nlp-library

Collecting indic-nlp-library
  Using cached indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Using cached sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Using cached sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Using cached Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Using cached sphinx-8.2.3-py3-none-any.whl.metadata (7.0 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Using cached docutils-0.22-py3-none-any.whl.metadata (15 kB)
Collecting sphinxcontrib-applehelp>=1.0.7 (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library)
  Using cached sphinxcontrib_applehelp-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-devhelp>=1.0.6 (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-l


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
from indicnlp.tokenize import indic_tokenize
def tokenize_hindi(text):
    return list(indic_tokenize.trivial_tokenize(text, lang='hi'))

df['hi_tokens'] = df[df['language'] == 'hi']['cleaned_message'].apply(tokenize_hindi)
df.head()

Unnamed: 0,address,body,readable_date,service_center,date,date_sent,cleaned_message,language,eng_tokens,hi_tokens
0,JM-PAYZAP-S,Login attempt on your PayZapp account at 06:52...,15 May 2025 7:32:18 pm,917021075036,1747317738643,1747317736000,Login attempt on your PayZapp account at 0652 ...,en,"[Login, attempt, on, your, PayZapp, account, a...",
1,51501,971115 Message ID: 0PYGGQiwfmU,15 May 2025 7:48:36 pm,918299901123,1747318716822,1747318715000,NUM Message ID 0PYGGQiwfmU,de,,
2,VM-HDFCBK-S,Sent Rs.420.00\nFrom HDFC Bank A/C *9442\nTo S...,15 May 2025 8:51:33 pm,919823000122,1747322493534,1747322491000,Sent Rs42000 From HDFC Bank AC NUM To SUNY On ...,en,"[Sent, Rs42000, From, HDFC, Bank, AC, NUM, To,...",
3,AE-AIRTEL-P,फ्री हैलोट्यून आपका इंतज़ार कर रही है!\nआज ही ए...,16 May 2025 12:37:35 pm,919831029607,1747379255850,1747379253000,फ्री हैलोट्यून आपका इंतज़ार कर रही है आज ही एयर...,hi,,"[फ्री, हैलोट्यून, आपका, इंतज़ार, कर, रही, है, आ..."
4,AE-AIRSLF,"नमस्ते, अपना करेंट पैक डिटेल घर बैठे एयरटेल थै...",16 May 2025 2:23:40 pm,919840011991,1747385620009,1747385618000,नमस्ते अपना करेंट पैक डिटेल घर बैठे एयरटेल थैं...,hi,,"[नमस्ते, अपना, करेंट, पैक, डिटेल, घर, बैठे, एय..."


## Computing Cost

Gradient descent involves repeated steps to adjust the value of your parameter $(w,b)$ to gradually get a smaller and smaller cost $J(w,b)$.
- At each step of gradient descent, it will be helpful for you to monitor your progress by computing the cost $J(w,b)$ as $(w,b)$ gets updated. 
- In this section, you will implement a function to calculate $J(w,b)$ so that you can check the progress of your gradient descent implementation.

#### Cost function
As you may recall from the lecture, for one variable, the cost function for linear regression $J(w,b)$ is defined as

$$J(w,b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{w,b}(x^{(i)}) - y^{(i)})^2$$ 

- You can think of $f_{w,b}(x^{(i)})$ as the model's prediction of your restaurant's profit, as opposed to $y^{(i)}$, which is the actual profit that is recorded in the data.
- $m$ is the number of training examples in the dataset

#### Model prediction

- For linear regression with one variable, the prediction of the model $f_{w,b}$ for an example $x^{(i)}$ is representented as:

$$ f_{w,b}(x^{(i)}) = wx^{(i)} + b$$

This is the equation for a line, with an intercept $b$ and a slope $w$

In [7]:
def compute_cost(x, y, w, b): 
    """
    Computes the cost function for linear regression.
    
    Args:
        x (ndarray): Shape (m,) Input to the model (Population of cities) 
        y (ndarray): Shape (m,) Label (Actual profits for the cities)
        w, b (scalar): Parameters of the model
    
    Returns
        total_cost (float): The cost of using w,b as the parameters for linear regression
               to fit the data points in x and y
    """
    # number of training examples
    m = x.shape[0] 
    
    # You need to return this variable correctly
    total_cost = 0
    f_wb = w * x + b
    error = f_wb - y
    sq_error = error ** 2
    total_error = np.sum(sq_error)
    total_cost = total_error/(2*m)

    return total_cost

## Gradient descent
The gradient descent algorithm is:

$$\begin{align*}& \text{repeat until convergence:} \; \lbrace \newline \; & \phantom {0000} b := b -  \alpha \frac{\partial J(w,b)}{\partial b} \newline       \; & \phantom {0000} w := w -  \alpha \frac{\partial J(w,b)}{\partial w} \tag{1}  \; & 
\newline & \rbrace\end{align*}$$

where, parameters $w, b$ are both updated simultaniously and where  
$$
\frac{\partial J(w,b)}{\partial b}  = \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{w,b}(x^{(i)}) - y^{(i)}) \tag{2}
$$
$$
\frac{\partial J(w,b)}{\partial w}  = \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{w,b}(x^{(i)}) -y^{(i)})x^{(i)} \tag{3}
$$
* m is the number of training examples in the dataset

    
*  $f_{w,b}(x^{(i)})$ is the model's prediction, while $y^{(i)}$, is the target value

* `compute_gradient` function calculates $\frac{\partial J(w)}{\partial w}$, $\frac{\partial J(w)}{\partial b}$ 

In [8]:
def compute_gradient(x, y, w, b): 
    """
    Computes the gradient for linear regression 
    Args:
      x (ndarray): Shape (m,) Input to the model (Population of cities) 
      y (ndarray): Shape (m,) Label (Actual profits for the cities)
      w, b (scalar): Parameters of the model  
    Returns
      dj_dw (scalar): The gradient of the cost w.r.t. the parameters w
      dj_db (scalar): The gradient of the cost w.r.t. the parameter b     
     """
    
    # Number of training examples
    m = x.shape[0]
    
    # You need to return the following variables correctly
    dj_dw = 0
    dj_db = 0
    gradient = ( ( w * x + b ) - y )
    dj_db = np.sum(gradient)/m
    gradient = gradient * x
    dj_dw = np.sum(gradient)/m
    return dj_dw, dj_db

### Learning parameters using batch gradient descent

In [10]:
def gradient_descent(x, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      x :    (ndarray): Shape (m,)
      y :    (ndarray): Shape (m,)
      w_in, b_in : (scalar) Initial values of parameters of the model
      cost_function: function to compute cost
      gradient_function: function to compute the gradient
      alpha : (float) Learning rate
      num_iters : (int) number of iterations to run gradient descent
    Returns
      w : (ndarray): Shape (1,) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
    """
    
    # number of training examples
    m = len(x)
    
    # An array to store cost J and w's at each iteration — primarily for graphing later
    J_history = []
    w_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_dw, dj_db = gradient_function(x, y, w, b )  

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               
        b = b - alpha * dj_db               

        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            cost =  cost_function(x, y, w, b)
            J_history.append(cost)

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters/10) == 0:
            w_history.append(w)
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")
        
    return w, b, J_history, w_history #return w and J,w history for graphing