In [None]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset layout is visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
#You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import random
import os

In [None]:
# Load the training utterance table, decoding the CSV as ISO-8859-1.
data = pd.read_csv('/kaggle/input/ml-hackathon-ec-campus-set-3-alt/train/text.csv', encoding='ISO-8859-1')
# Directory holding the training clips.
# NOTE: `video_dir` is a module-level name that later cells rebind, and
# get_video_clip_path reads it at call time.
video_dir = '/kaggle/input/ml-hackathon-ec-campus-set-3-alt/train/videos'


# Function to get video file path from IDs
def get_video_clip_path(row, base_dir=None):
    """Build the path of the clip that belongs to one utterance row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Dialogue_ID' and 'Utterance_ID'.
    base_dir : str, optional
        Directory containing the clips. When omitted, the module-level
        `video_dir` is looked up at call time (the original behaviour), so
        `frame.apply(get_video_clip_path, axis=1)` keeps working unchanged.

    Returns
    -------
    str
        `<base_dir>/dia<Dialogue_ID>_utt<Utterance_ID>.mp4`. The path is only
        assembled; nothing checks that the file exists.
    """
    if base_dir is None:
        # `video_dir` is rebound by several cells; passing `base_dir`
        # explicitly avoids depending on whichever value is current.
        base_dir = video_dir
    filename = f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.mp4"
    return os.path.join(base_dir, filename)

# Attach the expected clip path to every training row. Paths are assembled
# from the ID columns only; nothing here checks that the files exist.
data['video_clip_path'] = data.apply(get_video_clip_path, axis=1)

# Check sample paths
print(data[['Dialogue_ID', 'Utterance_ID', 'video_clip_path']].head())

In [None]:
# (rows, columns) of the training table, including the added video_clip_path column.
data.shape

In [None]:
# Load the test utterance table (decoded as ISO-8859-1) into `df`.
df = pd.read_csv('/kaggle/input/ml-hackathon-ec-campus-set-3-alt/test/text.csv', encoding='ISO-8859-1')
# Point `video_dir` at the test clips.
# NOTE: this rebinds the module-level `video_dir` that the training cell set.
video_dir = '/kaggle/input/ml-hackathon-ec-campus-set-3-alt/test/videos'


# Function to get video file path from IDs
def get_video_clip_path(row, base_dir=None):
    """Build the path of the clip that belongs to one utterance row.

    This re-definition replaces the identical function from the training
    cell; both read the module-level `video_dir` unless `base_dir` is given.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Dialogue_ID' and 'Utterance_ID'.
    base_dir : str, optional
        Directory containing the clips. When omitted, the module-level
        `video_dir` is looked up at call time (the original behaviour).

    Returns
    -------
    str
        `<base_dir>/dia<Dialogue_ID>_utt<Utterance_ID>.mp4`. The path is only
        assembled; nothing checks that the file exists.
    """
    if base_dir is None:
        # Fall back to the notebook-level setting so existing
        # `frame.apply(get_video_clip_path, axis=1)` calls keep working.
        base_dir = video_dir
    filename = f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.mp4"
    return os.path.join(base_dir, filename)

# Attach the expected clip path to every test row. Paths are assembled from
# the ID columns only; nothing here checks that the files exist.
df['video_clip_path'] = df.apply(get_video_clip_path, axis=1)

# Check sample paths
print(df[['Dialogue_ID', 'Utterance_ID', 'video_clip_path']].head())

In [None]:
# Preview the first rows of the training table.
data.head()

In [None]:
# Count missing values per column. This is computed BEFORE the cleaning below,
# so it describes the table as loaded.
missing_values = data.isnull().sum()

# Basic text cleaning on 'Utterance': delete every run of non-ASCII characters
# (anything outside \x00-\x7F). The column is overwritten in place, so the
# original text is no longer available after this cell runs.
data['Utterance'] = data['Utterance'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Collect the distinct values of the categorical columns for a consistency check
unique_values = {
    "Speaker": data['Speaker'].unique(),
    "Sentiment": data['Sentiment'].unique()
}

# Display both results as the cell output
missing_values, unique_values


In [None]:
from textblob import TextBlob

# Text-based feature extraction on 'Utterance' column:
#   word_count - number of whitespace-separated tokens
#   char_count - length of the string in characters
# NOTE(review): both lambdas assume every Utterance is a string; a missing
# value would raise here - confirm against the `missing_values` output.
data['word_count'] = data['Utterance'].apply(lambda x: len(x.split()))
data['char_count'] = data['Utterance'].apply(len)

# Sentiment polarity as a feature (using TextBlob)
data['text_sentiment'] = data['Utterance'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Display the first few rows with new features
data[['Utterance', 'word_count', 'char_count', 'text_sentiment']].head()


In [None]:
from datetime import datetime

# Convert StartTime and EndTime to duration in seconds
def convert_to_seconds(time_str):
    """Convert a timestamp such as "00:01:02,500" to seconds.

    Accepts "H:MM:SS,ms" / "HH:MM:SS,ms", the same layout with a '.' as the
    decimal separator, and the same layout with no fractional part.

    Parameters
    ----------
    time_str : str
        Timestamp text; surrounding whitespace is ignored.

    Returns
    -------
    float or int
        Hours, minutes, seconds and the fractional part folded into seconds.

    Raises
    ------
    TypeError
        If `time_str` is not a string (e.g. a missing value).
    ValueError
        If the text matches none of the supported layouts.
    """
    if not isinstance(time_str, str):
        raise TypeError(f"timestamp must be a string, got {type(time_str).__name__}")
    text = time_str.strip()
    # The previous fallback re-tried the very same format string, so it could
    # never succeed where the first attempt failed. Try genuinely different
    # layouts instead. (%H already accepts a single-digit hour.)
    for fmt in ('%H:%M:%S,%f', '%H:%M:%S.%f', '%H:%M:%S'):
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        return parsed.hour * 3600 + parsed.minute * 60 + parsed.second + parsed.microsecond / 1e6
    raise ValueError(f"Unrecognised timestamp format: {time_str!r}")

# Convert the StartTime and EndTime text columns to seconds
data['StartTime_sec'] = data['StartTime'].apply(convert_to_seconds)
data['EndTime_sec'] = data['EndTime'].apply(convert_to_seconds)

# Duration of each utterance in seconds (end minus start)
data['duration_sec'] = data['EndTime_sec'] - data['StartTime_sec']

# Display the first few rows with the new duration feature
data[['StartTime', 'EndTime', 'StartTime_sec', 'EndTime_sec', 'duration_sec']].head()


In [None]:
from gensim.models import Word2Vec
import numpy as np

# Tokenize the utterances for Word2Vec: lower-case, then split on whitespace
# (punctuation stays attached to the neighbouring word).
data['tokenized_utterance'] = data['Utterance'].apply(lambda x: x.lower().split())

# Train a Word2Vec model on the tokenized utterances: 100-dimensional vectors,
# context window of 5, every token kept (min_count=1), 10 epochs.
# sg=1 selects the skip-gram algorithm per gensim's documentation.
word2vec_model = Word2Vec(sentences=data['tokenized_utterance'], vector_size=100, window=5, min_count=1, sg=1, epochs=10)

# Function to get the average Word2Vec embedding for an utterance
def get_avg_word2vec(tokens, model, vector_size=100):
    """Average the embeddings of the tokens the model knows.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one utterance.
    model : object
        Must expose `model.wv.key_to_index` (membership test) and
        `model.wv[list_of_tokens]` (stacked vectors).
    vector_size : int, default 100
        Length of the zero vector returned when no token is known. It is not
        checked against the model's own dimensionality.

    Returns
    -------
    numpy.ndarray
        Mean vector over the known tokens, or zeros of length `vector_size`.
    """
    vocabulary = model.wv.key_to_index
    known = [tok for tok in tokens if tok in vocabulary]
    if not known:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(vector_size)
    return np.mean(model.wv[known], axis=0)

# Compute one averaged embedding per utterance. vector_size is left at the
# function's default of 100, matching the vector_size the model was trained with.
data['w2v_embedding'] = data['tokenized_utterance'].apply(lambda x: get_avg_word2vec(x, word2vec_model))

# Display the first few Word2Vec embeddings
data[['Utterance', 'w2v_embedding']].head()


In [None]:
import tensorflow as tf
# Record the installed TensorFlow version in the cell output.
print(tf.__version__)


In [None]:
import cv2
import os

# Define the directory containing the videos and the output directory for frames.
# NOTE: this rebinds the module-level `video_dir` back to the training clips.
video_dir = '/kaggle/input/ml-hackathon-ec-campus-set-3-alt/train/videos'  # Update this path
output_dir = '/kaggle/working/extracted_frames'
os.makedirs(output_dir, exist_ok=True)  # no error if the directory already exists

# Parameters
frame_interval = 10  # Extract every 10th frame
batch_size = 50      # Number of videos handed to process_video_batch per call

# Maps video base name -> list of saved frame paths (a dict, filled by process_video_batch)
extracted_frames = {}

# Function to process a batch of videos
def process_video_batch(video_files):
    """Save every `frame_interval`-th frame of each video as a 224x224 JPEG.

    Reads the module-level `frame_interval` and `output_dir`, and records the
    paths of the frames written in the module-level `extracted_frames` dict,
    keyed by the video's base name (the file name up to its first '.').

    Parameters
    ----------
    video_files : iterable of str
        Paths of the videos to process.

    Returns
    -------
    None
        Results are delivered through `extracted_frames` and the files on disk.
    """
    for video_path in video_files:
        video_name = os.path.basename(video_path).split('.')[0]  # File name without extension
        frame_list = []
        cap = cv2.VideoCapture(video_path)
        try:
            frame_count = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count % frame_interval == 0:
                    # Resize and save frame
                    frame_resized = cv2.resize(frame, (224, 224))
                    frame_filename = f"{video_name}_frame{frame_count}.jpg"
                    frame_path = os.path.join(output_dir, frame_filename)
                    # cv2.imwrite reports failure through its return value;
                    # only record frames that actually reached disk.
                    if cv2.imwrite(frame_path, frame_resized):
                        frame_list.append(frame_path)
                frame_count += 1
        finally:
            # Release the decoder even if resizing or writing raises.
            cap.release()
        extracted_frames[video_name] = frame_list

# Main loop: collect every .mp4 directly inside video_dir and process them in
# slices of `batch_size`. The order is whatever os.listdir returns.
all_video_files = [os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith('.mp4')]
for i in range(0, len(all_video_files), batch_size):
    batch_files = all_video_files[i:i + batch_size]
    process_video_batch(batch_files)

# Display paths to verify
print("Frames have been extracted and saved. Example paths:")
for video, frames in list(extracted_frames.items())[:5]:  # Display only the first few entries for brevity
    print(f"Video: {video}, Frames: {frames[:3]}")  # Show first 3 frames for each video


In [None]:
import cv2
import numpy as np
import os
import pandas as pd
import time
from scipy.ndimage import convolve
from skimage.color import rgb2gray

# Define directory paths
video_dir = '/kaggle/input/ml-hackathon-ec-campus-set-3-alt/train/videos'  # training clips to featurize
output_file = '/kaggle/working/video_features.csv'  # CSV that process_video_batch appends to
frame_interval = 10  # use every 10th frame of each video
batch_size = 50  # number of videos written to the CSV per batch

# Define a simple CNN with two convolutional layers and pooling
def simple_cnn_feature_extractor(image):
    """Turn one colour image into a flat vector with two fixed 3x3 filters.

    The image is converted to grayscale with `rgb2gray`, then passed twice
    through "convolve with a fixed kernel (reflect padding), keep every second
    row and column". Nothing is learned and no activation is applied; the
    down-sampling is plain stride-2 subsampling rather than max/mean pooling.

    Parameters
    ----------
    image : array-like
        Colour image accepted by `rgb2gray`.

    Returns
    -------
    numpy.ndarray
        1-D vector holding the twice-filtered, twice-subsampled image.
    """
    first_kernel = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
    second_kernel = np.array([[1, 0, -1], [0, 0, 0], [-1, 0, 1]])

    response = rgb2gray(image)  # Convert to grayscale
    for kernel in (first_kernel, second_kernel):
        filtered = convolve(response, kernel, mode='reflect')
        response = filtered[::2, ::2]  # halve both spatial dimensions

    return response.flatten()  # Flatten for feature vector

# Extract features for each video and average features across frames
def extract_video_features_with_custom_cnn(video_file):
    """Average the per-frame feature vectors of one video.

    Every `frame_interval`-th frame (module-level setting) is resized to
    64x64, converted from OpenCV's BGR channel order to RGB, and passed to
    `simple_cnn_feature_extractor`; the resulting vectors are averaged.

    Parameters
    ----------
    video_file : str
        Path of the video to read.

    Returns
    -------
    numpy.ndarray
        Mean feature vector, or 256 zeros when no frame could be read.
    """
    cap = cv2.VideoCapture(video_file)
    frame_features = []
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                frame_resized = cv2.resize(frame, (64, 64))
                # OpenCV decodes frames in BGR order, while the extractor's
                # rgb2gray weights the channels assuming RGB; swap first so
                # red and blue get the intended luminance weights.
                frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
                frame_features.append(simple_cnn_feature_extractor(frame_rgb))
            frame_count += 1
    finally:
        # Release the decoder even if a frame fails to process.
        cap.release()

    if not frame_features:
        # 256 = 16 * 16: a 64x64 frame halved twice by the extractor.
        return np.zeros(256)
    return np.mean(frame_features, axis=0)

# Function to process videos in batches and save features incrementally
def process_video_batch(video_files):
    """Extract features for a batch of videos and append them to `output_file`.

    This definition replaces the frame-dumping `process_video_batch` defined
    in an earlier cell. It reads the module-level `output_file`, writes the
    CSV header only when that file does not exist yet, and prints how long
    the batch took.

    Parameters
    ----------
    video_files : sequence of str
        Paths of the videos in this batch.

    Returns
    -------
    None
    """
    started_at = time.time()  # Start timing the batch

    # One (base name, feature vector) pair per video; the base name is the
    # file name up to its first '.'.
    rows = [
        (os.path.basename(path).split('.')[0], extract_video_features_with_custom_cnn(path))
        for path in video_files
    ]

    # Convert to DataFrame for easy saving; arrays become plain lists so the
    # CSV cell holds a list literal.
    batch_df = pd.DataFrame(rows, columns=['video_file', 'features'])
    batch_df['features'] = batch_df['features'].apply(lambda vec: vec.tolist())
    write_header = not os.path.exists(output_file)
    batch_df.to_csv(output_file, mode='a', index=False, header=write_header)

    elapsed = time.time() - started_at  # End timing the batch
    print(f"Processed batch of {len(video_files)} videos in {elapsed:.2f} seconds.")

# Main loop to process all videos in batches
total_start_time = time.time()  # Start timing the entire process
# Sorted so batch composition and CSV row order are stable across runs
# (os.listdir returns entries in arbitrary order).
all_video_files = sorted(os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith('.mp4'))

# process_video_batch appends to `output_file`. Drop any file left over from a
# previous run so re-executing this cell does not duplicate every row.
if os.path.exists(output_file):
    os.remove(output_file)

# Ceiling division: `len // batch_size + 1` over-counted by one whenever the
# number of videos was an exact multiple of batch_size.
total_batches = (len(all_video_files) + batch_size - 1) // batch_size

for i in range(0, len(all_video_files), batch_size):
    batch_files = all_video_files[i:i + batch_size]
    process_video_batch(batch_files)
    print(f"Completed batch {i // batch_size + 1} of {total_batches}")

total_end_time = time.time()  # End timing the entire process
print(f"Total processing time for all videos: {total_end_time - total_start_time:.2f} seconds.")


In [None]:
import pandas as pd
import numpy as np

import ast

# Load video features. Each `features` cell is the text form of a Python list
# (written via `.tolist()`), so parse it with ast.literal_eval, which accepts
# only literals, rather than eval, which would execute any expression found
# in the CSV.
video_features_df = pd.read_csv(
    '/kaggle/working/video_features.csv',
    converters={'features': ast.literal_eval},
)

# Ensure the feature vectors are numpy arrays
video_features_df['features'] = video_features_df['features'].apply(np.array)

# Display the dataframe structure to confirm loading
print("Video features loaded. Structure:")
print(video_features_df.head())


In [None]:
# List the columns accumulated on `data` so far, before selecting features.
print("Available columns in `data`:", data.columns)

In [None]:
import pandas as pd
import numpy as np

# Select relevant text features and embeddings from `data`; .copy() so the
# columns added below do not touch `data` itself.
text_features_df = data[['video_clip_path', 'word_count', 'char_count', 'text_sentiment', 'duration_sec', 'w2v_embedding']].copy()

# Pack the four scalar features into one list per row, in the order
# [word_count, char_count, text_sentiment, duration_sec].
text_features_df['structural_features'] = text_features_df[['word_count', 'char_count', 'text_sentiment', 'duration_sec']].values.tolist()

# Ensure Word2Vec embeddings are in array format (if not already)
text_features_df['w2v_embedding'] = text_features_df['w2v_embedding'].apply(lambda x: np.array(x) if not isinstance(x, np.ndarray) else x)

# Display the first few rows of the text features DataFrame
print("Text features DataFrame:")
print(text_features_df[['video_clip_path', 'structural_features', 'w2v_embedding']].head())


In [None]:
# Derive the join key `video_file` from the clip path: last path component,
# truncated at its first '.'. This mirrors how process_video_batch names videos.
text_features_df['video_file'] = text_features_df['video_clip_path'].apply(lambda x: x.split('/')[-1].split('.')[0])

# Merge `text_features_df` and `video_features_df` on `video_file`.
# how='inner' keeps only rows present in both frames.
combined_features_df = pd.merge(text_features_df, video_features_df, on='video_file', how='inner')

# Verify the structure and contents of the merged DataFrame
print("combined_features_df structure after merging:")
print(combined_features_df.info())
print(combined_features_df.head())


In [None]:
# Check `video_features_df` structure and contents.
# DataFrame.info() prints its report itself and returns None, so the wrapping
# print also emits a "None" line.
print("video_features_df structure:")
print(video_features_df.info())
print(video_features_df.head())



In [None]:
# Derive the join key straight from `video_clip_path` (last path component,
# truncated at its first '.') so this cell does not depend on an earlier cell
# having already created the `video_file` column.
text_features_df['video_file'] = text_features_df['video_clip_path'].apply(lambda x: x.split('/')[-1].split('.')[0])

# Verify that the `video_file` columns now match in both DataFrames
print("Sample video_file values in text_features_df:", text_features_df['video_file'].head())
print("Sample video_file values in video_features_df:", video_features_df['video_file'].head())

# Merge `text_features_df` and `video_features_df` on 'video_file'.
# how='inner' keeps only rows present in both frames.
combined_features_df = pd.merge(text_features_df, video_features_df, on='video_file', how='inner')

# Verify the structure and contents of the merged DataFrame
print("combined_features_df structure after merging:")
print(combined_features_df.info())
print(combined_features_df.head())


In [None]:
import numpy as np

# Concatenate all features into a single 1-D vector per row, in the order:
# structural features, Word2Vec embedding, video features.
# NOTE(review): `structural_features` contains `text_sentiment`, the same
# value the sentiment label is later derived from - see the training cell.
combined_features_df['combined_features'] = combined_features_df.apply(
    lambda row: np.concatenate([
        np.array(row['structural_features']), 
        np.array(row['w2v_embedding']), 
        np.array(row['features'])
    ]).ravel(), 
    axis=1
)

# Verify the structure of `combined_features`
print("Combined features DataFrame structure:")
print(combined_features_df[['video_file', 'combined_features']].head())
print("Shape of each combined feature vector:", combined_features_df['combined_features'].iloc[0].shape)


In [None]:
# Assuming `text_sentiment` is a polarity score, categorize it
def categorize_sentiment(polarity):
    """Bucket a polarity score into 'positive', 'negative' or 'neutral'.

    Scores strictly above 0.1 are 'positive' and scores strictly below -0.1
    are 'negative'. Everything else - including the two thresholds themselves
    and NaN - is 'neutral'.
    """
    if polarity > 0.1:
        return 'positive'
    if polarity < -0.1:
        return 'negative'
    return 'neutral'

# Label every row by thresholding its TextBlob polarity (`text_sentiment`).
# NOTE(review): this label is a function of a value that is also part of the
# feature vector, and it ignores the dataset's own 'Sentiment' column -
# confirm this is the intended target.
combined_features_df['sentiment_label'] = combined_features_df['text_sentiment'].apply(categorize_sentiment)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Prepare features (X) and target labels (y) using the new categorical sentiment label.
# np.stack turns the column of equal-length 1-D vectors into a 2-D matrix.
# NOTE(review): `sentiment_label` is derived by thresholding `text_sentiment`,
# and `text_sentiment` is itself one of the entries of `combined_features`
# (via `structural_features`). The model can therefore recover the label from
# a single input column, so the accuracy below says little about generalisation.
X = np.stack(combined_features_df['combined_features'].values)
y = combined_features_df['sentiment_label']  # Categorical labels

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out 20% (this is NOT the competition test set)
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Inspect the merged feature table: column names, then the first rows.
print(combined_features_df.columns)
print(combined_features_df.head())

In [None]:
# Take the submission IDs from the test table. The test CSV is loaded into `df`
# earlier in this notebook; the name `test_df` used here before is never
# defined, so this cell raised NameError on a fresh kernel.
all_ids = df["Sr No."]

# FIXME(review): these labels come from `combined_features_df`, which is built
# from the TRAINING table, not from `model.predict` on features of the test
# clips. The two Series are aligned by index when the DataFrame is built, so
# the rows are not guaranteed to correspond and lengths may differ. A valid
# submission needs the test set run through the same feature pipeline.
sentiments = combined_features_df['sentiment_label']  # Use the 'sentiment_label' column

# Create the DataFrame for submission
submission_df = pd.DataFrame({
    'Sr No.': all_ids,
    'Sentiment': sentiments
})

# Save the DataFrame to a CSV file
submission_df.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' created successfully.")
