In [1]:
import os
import glob
import pandas as pd

In [2]:
# --- USER: set the folder that contains all your .txt files ---
DATA_DIR = r"C:\Users\Talha\Documents\DataScience\Data Mining\Project\DataMining-Coursework\combined"

# helpful checks
print("Current working directory:", os.getcwd())
print("Checking DATA_DIR exists:", DATA_DIR)
if not os.path.isdir(DATA_DIR):
    raise NotADirectoryError(f"Path does not exist or is not a directory: {DATA_DIR}")

# find all .txt files (recursive)
txt_files = sorted(glob.glob(os.path.join(DATA_DIR, "**", "*.txt"), recursive=True))

#os.path.join(DATA_DIR, "**", "*.txt")
#os.path.join safely builds a file path that works on all operating systems (Windows, Linux, Mac).
#DATA_DIR is the main folder where all your text files are stored.
#"**" means “search in all subdirectories of this folder as well” (when recursive=True is used).
#"*.txt" means match all files that end with .txt.

#this will print the number of text files and the folder name in which all files are present
print(f"\nFound {len(txt_files)} .txt files under: {DATA_DIR}\n")

# show up to first 30 filenames (basename only)
sample = [os.path.basename(p) for p in txt_files[:30]]
 #os.path.basename(path) extracts only the filename part of a full path.
    #For example: os.path.basename("C:/GestureData/person1/gesture1.txt")
    # → "gesture1.txt"

for i, name in enumerate(sample, 1):
    print(f"{i:2d}. {name}") 
    #enumerate loops over the list but also gives you an index for each element.
    #The 1 means the index starts at 1 (instead of the default 0).
    #This is formatted printing.
    #f"{i:2d}" means print the number i as a 2-digit integer, aligned nicely.
    #This is just to visually inspect that your files were read correctly.

# Save a file index for traceability
index_df = pd.DataFrame({"filepath": txt_files, "filename": [os.path.basename(p) for p in txt_files]})
out_index = os.path.join(DATA_DIR, "file_index.csv")
index_df.to_csv(out_index, index=False)
print(f"\nSaved file index to: {out_index}")

Current working directory: C:\Users\Talha\Documents\DataScience\Data Mining\Project\DataMining-Coursework
Checking DATA_DIR exists: C:\Users\Talha\Documents\DataScience\Data Mining\Project\DataMining-Coursework\combined

Found 2700 .txt files under: C:\Users\Talha\Documents\DataScience\Data Mining\Project\DataMining-Coursework\combined

 1. afternoon_apurve_1.txt
 2. afternoon_apurve_2.txt
 3. afternoon_apurve_3.txt
 4. afternoon_apurve_4.txt
 5. afternoon_apurve_5.txt
 6. afternoon_apurve_6.txt
 7. afternoon_apurve_7.txt
 8. afternoon_apurve_8.txt
 9. afternoon_apurve_9.txt
10. afternoon_gautam_1.txt
11. afternoon_gautam_2.txt
12. afternoon_gautam_3.txt
13. afternoon_gautam_4.txt
14. afternoon_gautam_5.txt
15. afternoon_gautam_6.txt
16. afternoon_gautam_7.txt
17. afternoon_gautam_8.txt
18. afternoon_gautam_9.txt
19. afternoon_mahendra_1.txt
20. afternoon_mahendra_2.txt
21. afternoon_mahendra_3.txt
22. afternoon_mahendra_4.txt
23. afternoon_mahendra_5.txt
24. afternoon_mahendra_6.txt
2

# Parse Labels (Extract Gesture Names from Filenames)

In [6]:


#this will read all the files in the folder and subfolders and find the file_index.csv file by itself
#print(glob.glob("**/file_index.csv", recursive=True))

file_index= pd.read_csv("combined/file_index.csv")
print(file_index.head())

# Extract gesture label from the filename (before the first underscore)
file_index['label']= file_index['filename'].apply(lambda x:x.split('_')[0])

#check the first few rows to verify
print(file_index.head(5))

#check unique/number of gesture lables we have
print("Unique/Number of gesture labels: ",file_index['label'].unique())

# Save the updated file index with 3rd column name label
file_index.to_csv("file_index_with_labels.csv", index=False)
print("✅ Saved updated index with labels to file_index_with_labels.csv")

                                            filepath                filename
0  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_1.txt
1  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_2.txt
2  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_3.txt
3  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_4.txt
4  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_5.txt
                                            filepath                filename  \
0  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_1.txt   
1  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_2.txt   
2  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_3.txt   
3  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_4.txt   
4  C:\Users\Talha\Documents\DataScience\Data Mini...  afternoon_apurve_5.txt   

       label  
0  afternoon  
1  afternoon  
2  afternoon

# Sequence level Features Extraction from each text file

In [7]:
import numpy as np
#this will read all the files in the folder and subfolders and find the file_index.csv file by itself
#print(glob.glob("**/file_index_with_labels.csv", recursive=True))

txt_files= pd.read_csv("combined/file_index_with_labels.csv")
txt_files.head()

#pick the first file path
first_file_path= txt_files["filepath"].iloc[0]
print("First File Path: ", first_file_path)

#load the first file as numpy array
data= np.loadtxt(first_file_path)
print("Shape of Data: ", data.shape)

#Display few Frame level features
data[:2]

#Calculate the statistical level features for all 60 columns
feature_dict={}

for i in range(data.shape[1]):
    feature_dict[f'col{i+1}_mean: ']= np.mean(data[:, i])
    feature_dict[f'col{i+2}_std: ']= np.std(data[:,i])
    feature_dict[f'col{i+3}_min: ']= np.min(data[:,i])
    feature_dict[f'col{i+4}_max: ']= np.max(data[:,i])
    


#Convert this into DataFrame (1 row of feature)
feature_df= pd.DataFrame([feature_dict])
print("Shape of feature_list: ",feature_df.shape)
print(feature_df.head(1))


First File Path:  C:\Users\Talha\Documents\DataScience\Data Mining\Project\DataMining-Coursework\combined\afternoon_apurve_1.txt
Shape of Data:  (64, 60)
Shape of feature_list:  (1, 240)
   col1_mean:   col2_std:   col3_min:   col4_max:   col2_mean:   col3_std:   \
0     -0.38381    0.003073   -0.387078   -0.376956     0.673528     0.00205   

   col4_min:   col5_max:   col3_mean:   col4_std:   ...  col60_min:   \
0    0.669575    0.678407     2.477884    0.001087  ...    -0.211996   

   col61_max:   col59_mean:   col60_std:   col61_min:   col62_max:   \
0    -0.206404     -0.937998     0.000981    -0.940838    -0.935661   

   col60_mean:   col61_std:   col62_min:   col63_max:   
0      2.647377       0.0034     2.633385     2.651656  

[1 rows x 240 columns]


# Now Extract all files Sequence Level Gestures

In [8]:
# Define feature extraction function to read text files

def extract_feature_from_file(filepath):

    df= pd.read_csv(filepath,header=None)
    #This tells pandas not to treat the first row as column headers.
    #Normally, pandas assumes the first line of a file contains the column names.
    #But in our .txt files, every line is just numeric data (like sensor readings) — there are no headers.
    feature_dict={}

    for i in range(data.shape[1]):
        feature_dict[f'col{i+1}_mean: ']= np.mean(data[:,i])
        feature_dict[f'col{i+2}_std: ']= np.std(data[:,i])
        feature_dict[f'col{i+3}_min: ']= np.min(data[:,i])
        feature_dict[f'col{i+4}_max: ']= np.max(data[:,i])

    return feature_dict

# Loop through all gesture files and extract features
all_feature_dict= []

    #.iterrows() function lets us go row by row through that DataFrame.
    #i → gives the index number (like 0, 1, 2, 3…)
    #row → gives that row’s content (like a small dictionary of column names and values).
for i,row in file_index.iterrows():
        #We extract the full file path from that row so we can open that specific file.
    file_path= row['filepath']
    label= row['label']
    features= extract_feature_from_file(file_path)
    features['label']= label
    all_feature_dict.append(features)


# combine into one DataFrame
feature_df= pd.DataFrame(all_feature_dict)

# Save the resulting dataset
output_path = os.path.join("combined", "sequence_level_features.csv")
features_df.to_csv(output_path, index=False)
print(f"✅ Feature extraction complete. Saved to: {output_path}")
print(f"Shape of final dataset: {features_df.shape}")

        

AttributeError: 'dict' object has no attribute 'append'