In [None]:
# Complete CSR AI Training & API Service

This notebook contains everything you need:
1. Upload your CSV datasets
2. Train the conversational AI model (currently disabled — this notebook runs in CSV-only mode and only loads the CSV data)
3. Start the API service on port 5004


In [9]:
# Install dependencies (CSV-only)
!pip install pandas flask flask-cors numpy scikit-learn ipywidgets


Defaulting to user installation because normal site-packages is not writeable


In [10]:
# Upload CSV files and imports
from IPython.display import display
import ipywidgets as widgets
from io import StringIO
import pandas as pd
import os
import logging, json, random, threading, time, re

# ML Libraries removed ‚Äî CSV-only mode
# TRAINING_AVAILABLE is a plain flag; nothing else in this cell reads it.
TRAINING_AVAILABLE = False
print("‚úÖ Running in CSV-only mode (no ML libraries).")

from flask import Flask, request, jsonify
from flask_cors import CORS

# Module-wide logger used by the helper functions defined in later cells.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create datasets directory
# (relative to the kernel's current working directory; a no-op if it exists)
os.makedirs('datasets', exist_ok=True)

# One upload widget per dataset, each restricted to .csv files.
# This cell only creates and displays the widgets; it does not itself write
# any uploaded content into the datasets/ directory.
print("üìÅ Upload your 3 CSV files (CSV-only mode):")
denial_upload = widgets.FileUpload(accept='.csv', description='Denial CSV')
member_upload = widgets.FileUpload(accept='.csv', description='Member CSV') 
plan_upload = widgets.FileUpload(accept='.csv', description='Plan CSV')

print("1. Denial reasons CSV:")
display(denial_upload)
print("2. Member subscription CSV:")
display(member_upload)  
print("3. Plan coverage CSV:")
display(plan_upload)

# Global data variables
# Start as None so later code can tell "not loaded yet" from a loaded frame.
# Re-running this cell resets all three to None.
denial_data = member_data = plan_data = None


‚úÖ Running in CSV-only mode (no ML libraries).
üìÅ Upload your 3 CSV files (CSV-only mode):
1. Denial reasons CSV:


FileUpload(value=(), accept='.csv', description='Denial CSV')

2. Member subscription CSV:


FileUpload(value=(), accept='.csv', description='Member CSV')

3. Plan coverage CSV:


FileUpload(value=(), accept='.csv', description='Plan CSV')

In [11]:
# CSV Loading with Encoding Support - NO HARDCODING!
def load_csv_with_encoding(file_path):
    """Read a CSV file into a DataFrame, trying several text encodings in turn.

    For each candidate encoding the file is parsed with pandas' defaults first
    and, if that fails, once more with ``skipinitialspace=True`` (explicit
    comma separator and double-quote quoting). The first successful parse is
    returned. No fallback/hardcoded data is ever substituted.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with the parsed contents.

    Raises:
        FileNotFoundError: ``file_path`` is a path that does not exist.
        Exception: every encoding/parse attempt failed; the last underlying
            error is attached as ``__cause__``.
    """
    # A missing file cannot be fixed by switching encodings: fail fast with the
    # specific error instead of retrying and reporting an encoding problem.
    if isinstance(file_path, (str, os.PathLike)) and not os.path.exists(file_path):
        raise FileNotFoundError(f"CSV file not found: {file_path}")

    encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']
    last_error = None

    for encoding in encodings:
        try:
            logger.info(f"üìÅ Trying to load {file_path} with {encoding} encoding...")
            try:
                return pd.read_csv(file_path, encoding=encoding)
            except Exception:
                # `except Exception` (not a bare `except`) so Ctrl+C and
                # SystemExit are never swallowed by the retry.
                return pd.read_csv(file_path, encoding=encoding, sep=',', quotechar='"', skipinitialspace=True)
        except UnicodeDecodeError as exc:
            last_error = exc
            logger.warning(f"‚ö†Ô∏è  Failed with {encoding} encoding, trying next...")
        except Exception as exc:
            last_error = exc
            logger.warning(f"‚ö†Ô∏è  Failed parsing {file_path} with {encoding}: {exc}")

    # NO HARDCODED DATA - Must use uploaded datasets only
    raise Exception(
        f"‚ùå Could not load {file_path} with any supported encoding. Please check your CSV file format."
    ) from last_error



def _save_uploaded_csv(upload_widget, destination):
    """Write the most recently uploaded file of a FileUpload widget to disk.

    Args:
        upload_widget: An ipywidgets ``FileUpload`` instance, or None.
        destination: Path the uploaded bytes are written to.

    Returns:
        True if a file was written, False if the widget holds no upload.
    """
    uploaded = getattr(upload_widget, 'value', None)
    if not uploaded:
        return False

    # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple of
    # per-file entries; both carry the raw bytes under 'content'.
    entries = list(uploaded.values()) if isinstance(uploaded, dict) else list(uploaded)
    content = entries[-1]['content']

    os.makedirs(os.path.dirname(destination) or '.', exist_ok=True)
    with open(destination, 'wb') as handle:
        handle.write(bytes(content))
    return True


def process_uploads():
    """Persist any widget uploads, then load the three CSV datasets.

    For each dataset, a file uploaded through the matching ``*_upload`` widget
    (if that widget exists and holds a file) is saved under ``datasets/``.
    The file is then read with ``load_csv_with_encoding`` into the module
    globals ``denial_data``, ``member_data`` and ``plan_data``. A dataset that
    is missing or unreadable is reported and its global is reset to None, so a
    re-run never leaves stale data behind.

    Returns:
        True only if all three datasets were loaded.
    """
    global denial_data, member_data, plan_data

    sources = (
        ('denial', 'datasets/denial_reason.csv', 'denial codes'),
        ('member', 'datasets/member_subscription.csv', 'members'),
        ('plan', 'datasets/plan_coverage.csv', 'plans'),
    )

    frames = {}
    for key, path, label in sources:
        frames[key] = None
        try:
            # The widgets live in another cell; look them up defensively so
            # this function still works when they were never created.
            _save_uploaded_csv(globals().get(f'{key}_upload'), path)

            if not os.path.exists(path):
                print(f"‚ö†Ô∏è Missing: {path}")
                continue

            frame = load_csv_with_encoding(path)
            print(f"‚úÖ {len(frame)} {label} loaded")
            frames[key] = frame
        except Exception as exc:
            # Encoding/parse problems must not abort the remaining datasets.
            print(f"‚ö†Ô∏è Could not load {path}: {exc}")

    denial_data = frames['denial']
    member_data = frames['member']
    plan_data = frames['plan']

    return all(frame is not None for frame in frames.values())
 

# Training Conversation Generation (EXACT same as Python files - NO HARDCODING)
def generate_denial_conversations(denials=None):
    """Build question/answer training texts for every denial code row.

    Each row yields ten differently phrased user questions, all paired with
    the same answer assembled from the row's description and suggested action.

    Args:
        denials: DataFrame with the columns ``user_code``, ``denial_code``,
            ``description`` and ``suggested_action``. Defaults to the
            module-level ``denial_data``.

    Returns:
        List of conversation strings, each terminated by ``<|endoftext|>``.

    Raises:
        ValueError: no DataFrame was passed and ``denial_data`` is not loaded.
    """
    if denials is None:
        denials = denial_data
    if denials is None:
        raise ValueError("Denial data is not loaded; run process_uploads() first.")

    conversations = []

    for _, row in denials.iterrows():
        user_code = row['user_code']
        denial_code = row['denial_code']
        description = row['description']
        action = row['suggested_action']

        # The answer is identical for every phrasing, so build it once per row.
        answer = f"Denial code {denial_code} ({user_code}) means: {description}\n\nRecommended action: {action}"

        # Generate various natural ways users might ask about this denial
        questions = [
            f"What does denial code {denial_code} mean?",
            f"Why was my claim rejected with code {denial_code}?",
            f"Can you explain {user_code}{denial_code}?",
            f"I got a denial with code {denial_code}, what should I do?",
            f"What is the reason for denial code {denial_code}?",
            f"Help me understand why code {denial_code} was used",
            f"My claim shows {user_code} {denial_code}, what's wrong?",
            f"Explain the denial code {user_code}{denial_code}",
            f"What does code {denial_code} indicate?",
            f"I received denial {denial_code}, what's the next step?"
        ]

        conversations.extend(
            f"User: {question}\nAssistant: {answer}<|endoftext|>"
            for question in questions
        )

    return conversations

def generate_plan_conversations(members=None, plans=None):
    """Build plan-coverage training texts for every member with a known plan.

    Members whose ``plan_id`` has no row in ``plans`` are skipped. When a plan
    id occurs more than once, its first row is used. Every remaining member
    yields ten questions, each answered with the same raw data context.

    Args:
        members: DataFrame with ``member_id``, ``member_name``, ``plan_id``,
            ``status``, ``effective_date`` and ``end_date``. Defaults to the
            module-level ``member_data``.
        plans: DataFrame with ``plan_id``, ``coverage_type``,
            ``covered_services``, ``copay`` and ``notes``. Defaults to the
            module-level ``plan_data``.

    Returns:
        List of conversation strings, each terminated by ``<|endoftext|>``.

    Raises:
        ValueError: a dataset was neither passed in nor loaded.
    """
    members = member_data if members is None else members
    plans = plan_data if plans is None else plans
    if members is None or plans is None:
        raise ValueError("Member and plan data are not loaded; run process_uploads() first.")

    # Index the first row of every plan once, instead of scanning the whole
    # plan table for each member. Missing ids are skipped: they never compare
    # equal to anything in a pandas `==` filter either.
    first_plan_row = {}
    for _, candidate in plans.iterrows():
        plan_key = candidate['plan_id']
        if pd.notna(plan_key) and plan_key not in first_plan_row:
            first_plan_row[plan_key] = candidate

    conversations = []

    # Get member data to create realistic plan queries
    for _, member in members.iterrows():
        member_plan = member['plan_id']
        plan_row = first_plan_row.get(member_plan) if pd.notna(member_plan) else None
        if plan_row is None:
            continue

        # Natural questions about plan coverage
        questions = [
            f"Is dental covered for member {member['member_id']}?",
            f"What's the copay for {member['member_name']}?",
            f"Does {member['member_id']} have vision coverage?",
            f"Tell me about {member['member_name']}'s plan",
            f"What services are covered for member {member['member_id']}?",
            f"What's the coverage for {member['member_name']}?",
            f"Check {member['member_id']} plan benefits",
            f"Show plan details for {member['member_name']}",
            f"What does {member['member_id']}'s plan cover?",
            f"Is emergency care covered for {member['member_name']}?"
        ]

        # Raw data context, identical for every question about this member.
        data_context = "\n".join([
            f"Member: {member['member_name']} ({member['member_id']})",
            f"Status: {member['status']}",
            f"Plan ID: {member['plan_id']}",
            f"Coverage Period: {member['effective_date']} to {member['end_date']}",
            f"Coverage Type: {plan_row['coverage_type']}",
            f"Covered Services: {plan_row['covered_services']}",
            f"Copay: {plan_row['copay']}",
            f"Notes: {plan_row['notes']}",
        ])

        conversations.extend(
            f"User: {question}\nAssistant: {data_context}<|endoftext|>"
            for question in questions
        )

    return conversations

def generate_member_conversations(members=None, plans=None):
    """Build member-lookup training texts for every member row.

    Every member yields ten questions answered with the same data context.
    When the member's ``plan_id`` has a row in ``plans`` (the first one, if
    several), a one-line plan summary is included; otherwise the
    "Plan Details" field is left empty.

    Args:
        members: DataFrame with ``member_id``, ``member_name``, ``plan_id``,
            ``status``, ``effective_date`` and ``end_date``. Defaults to the
            module-level ``member_data``.
        plans: DataFrame with ``plan_id``, ``coverage_type`` and
            ``covered_services``. Defaults to the module-level ``plan_data``.

    Returns:
        List of conversation strings, each terminated by ``<|endoftext|>``.

    Raises:
        ValueError: a dataset was neither passed in nor loaded.
    """
    members = member_data if members is None else members
    plans = plan_data if plans is None else plans
    if members is None or plans is None:
        raise ValueError("Member and plan data are not loaded; run process_uploads() first.")

    # One pass over the plans replaces a full plan-table scan per member.
    # Missing ids are skipped, matching a pandas `==` filter (never equal).
    first_plan_row = {}
    for _, candidate in plans.iterrows():
        plan_key = candidate['plan_id']
        if pd.notna(plan_key) and plan_key not in first_plan_row:
            first_plan_row[plan_key] = candidate

    conversations = []

    for _, member in members.iterrows():
        member_id = member['member_id']
        name = member['member_name']
        plan_id = member['plan_id']
        status = member['status']
        effective_date = member['effective_date']
        end_date = member['end_date']

        # Find plan details for this member
        plan_row = first_plan_row.get(plan_id) if pd.notna(plan_id) else None
        plan_details = ""
        if plan_row is not None:
            plan_details = f"Their {plan_row['coverage_type']} plan covers: {plan_row['covered_services']}"

        questions = [
            f"Show me details for member {member_id}",
            f"What plan is {name} on?",
            f"Give me information about {member_id}",
            f"Tell me about {name}",
            f"What's {member_id} member status?",
            f"When does {name}'s coverage end?",
            f"Is member {member_id} active?",
            f"Find {name} in the system",
            f"Look up member {member_id}",
            f"What coverage does {name} have?"
        ]

        # Raw data context, identical for every question about this member.
        data_context = "\n".join([
            f"Member: {name} ({member_id})",
            f"Status: {status}",
            f"Plan ID: {plan_id}",
            f"Coverage Period: {effective_date} to {end_date}",
            f"Plan Details: {plan_details}",
        ])

        conversations.extend(
            f"User: {question}\nAssistant: {data_context}<|endoftext|>"
            for question in questions
        )

    return conversations

def generate_complex_conversations(members=None, plans=None):
    """Build combined member + plan training texts.

    Only member and plan data are used here; denial data is not read. Members
    whose ``plan_id`` has no row in ``plans`` are skipped. When a plan id
    occurs more than once, its first row is used. Each remaining member yields
    five questions answered with the same raw data context.

    Args:
        members: DataFrame with ``member_id``, ``member_name``, ``plan_id``,
            ``status``, ``effective_date`` and ``end_date``. Defaults to the
            module-level ``member_data``.
        plans: DataFrame with ``plan_id``, ``coverage_type``,
            ``covered_services``, ``copay`` and ``notes``. Defaults to the
            module-level ``plan_data``.

    Returns:
        List of conversation strings, each terminated by ``<|endoftext|>``.

    Raises:
        ValueError: a dataset was neither passed in nor loaded.
    """
    members = member_data if members is None else members
    plans = plan_data if plans is None else plans
    if members is None or plans is None:
        raise ValueError("Member and plan data are not loaded; run process_uploads() first.")

    # One pass over the plans replaces a full plan-table scan per member.
    # Missing ids are skipped, matching a pandas `==` filter (never equal).
    first_plan_row = {}
    for _, candidate in plans.iterrows():
        plan_key = candidate['plan_id']
        if pd.notna(plan_key) and plan_key not in first_plan_row:
            first_plan_row[plan_key] = candidate

    conversations = []

    for _, member in members.iterrows():
        member_plan = member['plan_id']
        plan_row = first_plan_row.get(member_plan) if pd.notna(member_plan) else None
        if plan_row is None:
            continue

        # Member + plan questions
        questions = [
            f"What benefits does {member['member_name']} have?",
            f"Is dental covered for member {member['member_id']}?",
            f"What's {member['member_name']}'s copay for emergency?",
            f"Does {member['member_id']} have vision coverage?",
            f"Tell me about {member['member_name']}'s plan coverage"
        ]

        # Raw data context, identical for every question about this member.
        data_context = "\n".join([
            f"Member: {member['member_name']} ({member['member_id']})",
            f"Status: {member['status']}",
            f"Plan ID: {member['plan_id']}",
            f"Coverage Period: {member['effective_date']} to {member['end_date']}",
            f"Coverage Type: {plan_row['coverage_type']}",
            f"Covered Services: {plan_row['covered_services']}",
            f"Copay: {plan_row['copay']}",
            f"Notes: {plan_row['notes']}",
        ])

        conversations.extend(
            f"User: {question}\nAssistant: {data_context}<|endoftext|>"
            for question in questions
        )

    return conversations

def generate_training_conversations():
    """Collect the conversations of all four generators into a single list.

    The generators run in a fixed order (denial, plan, member, complex) and
    their results are concatenated in that order. The total count is logged.

    Returns:
        List of conversation strings.
    """
    builders = (
        generate_denial_conversations,   # denial code conversations
        generate_plan_conversations,     # plan coverage conversations
        generate_member_conversations,   # member lookup conversations
        generate_complex_conversations,  # mixed/complex conversations
    )

    conversations = []
    for build in builders:
        conversations += build()

    logger.info(f"üéØ Generated {len(conversations)} training conversations")
    return conversations

def train_conversational_model(model_name=None):
    """Training stub kept for the CSV-only notebook.

    Logs that training is disabled and does nothing else.

    Args:
        model_name: Accepted but not used.

    Returns:
        None, always.
    """
    logger.info("üõë Training disabled ‚Äî CSV-only mode.")
    return

# Load the CSVs now and report the outcome (CSV-only mode; training disabled)
print(
    "‚úÖ CSVs loaded. Skipping model training (CSV-only mode)."
    if process_uploads()
    else "‚ö†Ô∏è Upload all 3 CSV files first, then re-run this cell"
)


‚úÖ 415 denial codes loaded
‚úÖ 10000 members loaded
‚úÖ 100 plans loaded
‚úÖ CSVs loaded. Skipping model training (CSV-only mode).


In [12]:
# === Trained Model API (Notebook cell) ‚Äî same structure/behavior as trained_model_api.py ===

# Imports
import os
import re
import time
import json
import threading
import logging
from datetime import datetime

import pandas as pd
from flask import Flask, request, jsonify
from flask_cors import CORS

# Resolve datasets absolute path
# (anchored to the kernel's working directory at the moment this cell runs)
BASE_DIR = os.getcwd()
DATASETS_DIR = os.path.join(BASE_DIR, 'datasets')

# ML Libraries removed ‚Äî CSV-only mode
# ML_AVAILABLE gates _load_trained_model(), which returns early while it is False.
ML_AVAILABLE = False

# Flask app
# CORS is enabled with no options.
# NOTE(review): presumably this allows every origin — confirm against flask-cors defaults.
app = Flask(__name__)
CORS(app)

# Module-wide logger used by the service class and the endpoints below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TrainedCSRModelService:
	"""Answer CSR queries from the three CSV datasets (CSV-only mode).

	The service loads the denial, member and plan CSV files at construction
	and answers queries by direct lookup in those frames. Model attributes are
	kept for compatibility but no model is loaded in CSV-only mode.
	"""

	# Upper bound on how many identifiers a fallback help message lists, so a
	# large dataset is never dumped into a single response.
	MAX_FALLBACK_EXAMPLES = 10

	def __init__(self):
		"""Initialise empty state and load the CSV datasets."""
		self.model = None
		self.tokenizer = None
		self.denial_data = None
		self.member_data = None
		self.plan_data = None

		# Load your CSV data for context
		self._load_csv_data()

		# Model loading removed ‚Äî CSV-only mode
		# self._load_trained_model()

	def _load_csv_data(self):
		"""Load each CSV dataset independently for direct lookups.

		A dataset that cannot be loaded is logged and left as None; the other
		datasets are still attempted, so one bad file does not disable all
		lookups.
		"""
		sources = (
			('denial_data', 'denial_reason.csv'),
			('member_data', 'member_subscription.csv'),
			('plan_data', 'plan_coverage.csv'),
		)

		for attribute, filename in sources:
			try:
				setattr(self, attribute, self._load_csv_with_encoding(os.path.join(DATASETS_DIR, filename)))
			except Exception as e:
				logger.error(f"‚ùå Error loading CSV data: {e}")

		if all(getattr(self, attribute) is not None for attribute, _ in sources):
			logger.info(f"‚úÖ Loaded CSV data: {len(self.denial_data)} denials, {len(self.member_data)} members, {len(self.plan_data)} plans")

	def _load_csv_with_encoding(self, file_path):
		"""Read a CSV file, trying several encodings and two parser settings.

		Args:
			file_path: Path of the CSV file to read.

		Returns:
			pandas.DataFrame from the first attempt that parses.

		Raises:
			FileNotFoundError: ``file_path`` is a path that does not exist.
			Exception: every attempt failed; the last underlying error is
				attached as ``__cause__``.
		"""
		# A missing file cannot be fixed by switching encodings; report it as such.
		if isinstance(file_path, (str, os.PathLike)) and not os.path.exists(file_path):
			raise FileNotFoundError(f"CSV file not found: {file_path}")

		encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']
		# Plain parse first, then a more lenient one that skips spaces after delimiters.
		parser_options = ({}, {'sep': ',', 'quotechar': '"', 'skipinitialspace': True})
		last_error = None

		for encoding in encodings:
			for options in parser_options:
				try:
					return pd.read_csv(file_path, encoding=encoding, **options)
				except Exception as exc:
					# `except Exception` (not a bare `except`) so Ctrl+C and
					# SystemExit are never swallowed by the retry loop.
					last_error = exc

		raise Exception(f"Could not load {file_path} with any supported encoding") from last_error

	def _load_trained_model(self):
		"""Load a custom trained model from disk, if ML support is enabled.

		Returns False immediately while ``ML_AVAILABLE`` is False. The names
		``AutoTokenizer`` and ``AutoModelForCausalLM`` are not imported in this
		notebook, so the loading branch only works once they are in scope.

		Returns:
			True if a model was loaded, False otherwise.
		"""
		if not ML_AVAILABLE:
			logger.warning("‚ö†Ô∏è  ML libraries not available. Skipping model load.")
			return False

		model_paths = ['./csr_bot_model', './trained_csr_bot']

		# Try to load your custom trained model
		for model_path in model_paths:
			if os.path.exists(model_path):
				try:
					logger.info(f"ü§ñ Loading your custom trained model from {model_path}...")
					self.tokenizer = AutoTokenizer.from_pretrained(model_path)
					self.model = AutoModelForCausalLM.from_pretrained(model_path)
					logger.info("‚úÖ Custom trained model loaded successfully!")
					return True
				except Exception as e:
					logger.warning(f"‚ö†Ô∏è  Could not load custom model from {model_path}: {e}")

		# Fallback: no model, rely on direct CSV + gentle fallbacks
		logger.warning("‚ö†Ô∏è  No custom trained model found. Using direct CSV lookup only")
		return False

	def process_query(self, user_query, query_type=None):
		"""Answer a user query using CSV data only (no model).

		Args:
			user_query: The question text.
			query_type: Optional hint, 'denial' or 'member', that forces the
				matching lookup even without trigger words in the query.

		Returns:
			Dict with ``success`` and ``response``; on success also ``source``
			('direct_csv_lookup' or 'fallback') and ``confidence``; on failure
			also ``error``.
		"""
		try:
			# First try direct CSV lookup for exact matches
			direct_response = self._direct_csv_lookup(user_query, query_type)
			if direct_response:
				return {
					'success': True,
					'response': direct_response,
					'source': 'direct_csv_lookup',
					'confidence': 0.95
				}

			# Model path removed; using CSV-only and fallback responses

			# Fallback response
			return {
				'success': True,
				'response': self._get_fallback_response(user_query),
				'source': 'fallback',
				'confidence': 0.5
			}

		except Exception as e:
			logger.error(f"‚ùå Error processing query: {e}")
			return {
				'success': False,
				'error': str(e),
				'response': 'I encountered an error processing your request. Please try again.'
			}

	def _direct_csv_lookup(self, query, query_type=None):
		"""Look the query up directly in the CSV data.

		Tries, in order: a denial code, a member id, a plan id. Each stage
		only runs when ``query_type`` or a trigger word in the query selects it.

		Returns:
			A response dict for the first match, or None when nothing matches.
		"""
		query_lower = query.lower()

		# Denial code lookup
		if query_type == 'denial' or any(word in query_lower for word in ['denial', 'code', 'reject', 'denied']):
			denial_answer = self._lookup_denial(query)
			if denial_answer:
				return denial_answer

		# Member lookup / coverage lookup
		if query_type == 'member' or any(word in query_lower for word in ['member', 'patient', 'covered', 'coverage', 'plan']):
			member_answer = self._lookup_member(query, query_lower)
			if member_answer:
				return member_answer

			plan_answer = self._lookup_plan(query_lower)
			if plan_answer:
				return plan_answer

		return None

	def _lookup_denial(self, query):
		"""Return a denial explanation for a code found in the query, or None."""
		if self.denial_data is None:
			return None

		# astype(str) keeps the comparison working when a column was parsed as
		# a non-string dtype.
		user_codes = self.denial_data['user_code'].astype(str).str.upper()
		denial_codes = self.denial_data['denial_code'].astype(str).str.upper()

		# Look for patterns like CO-45, PR96, etc.
		for prefix, number in re.findall(r'\b([A-Za-z]{1,2})-?(\d{1,3})\b', query, flags=re.I):
			match = self.denial_data[(user_codes == prefix.upper()) & (denial_codes == number)]
			if not match.empty:
				return self._denial_payload(match.iloc[0])

		# If numeric-only provided (e.g., "45"): accept it only when unambiguous.
		for number in re.findall(r'\b(\d{1,3})\b', query):
			subset = self.denial_data[denial_codes == number]
			if len(subset) == 1:
				return self._denial_payload(subset.iloc[0])

		return None

	def _denial_payload(self, row):
		"""Build the response dict for one denial row."""
		return {
			'type': 'denial_explanation',
			'user_code': str(row['user_code']),
			'denial_code': str(row['denial_code']),
			'description': str(row['description']),
			'suggested_action': str(row['suggested_action'])
		}

	def _lookup_member(self, query, query_lower):
		"""Return member and coverage details for a member id like M12345, or None."""
		if self.member_data is None:
			return None

		mid = re.search(r'\b[Mm]\d+\b', query)
		if not mid:
			return None

		member_id = mid.group(0).upper()
		member_rows = self.member_data[self.member_data['member_id'].astype(str).str.upper() == member_id]
		if member_rows.empty:
			return None

		member = member_rows.iloc[0]

		# The member is still reported when plan data is unavailable or has no
		# row for the member's plan; plan details then stay empty.
		plan_row = None
		if self.plan_data is not None:
			plan_info = self.plan_data[self.plan_data['plan_id'] == member['plan_id']]
			if not plan_info.empty:
				plan_row = plan_info.iloc[0]

		plan_details = {}
		if plan_row is not None:
			plan_details = {
				'coverage_type': str(plan_row.get('coverage_type', '')),
				'covered_services': str(plan_row.get('covered_services', '')),
				'copay': str(plan_row.get('copay', '')),
				'notes': str(plan_row.get('notes', ''))
			}

		coverage_answer = self._analyze_coverage_question(
			query_lower,
			plan_row if plan_row is not None else {'covered_services': ''}
		)
		return {
			'type': 'member_coverage',
			'member_id': str(member.get('member_id', '')),
			'member_name': str(member.get('member_name', '')),
			'plan_id': str(member.get('plan_id', '')),
			'status': str(member.get('status', '')),
			'effective_date': str(member.get('effective_date', '')),
			'end_date': str(member.get('end_date', '')),
			'coverage_answer': coverage_answer,
			'plan_details': plan_details
		}

	def _lookup_plan(self, query_lower):
		"""Return coverage details for the first plan id mentioned as a whole token, or None."""
		if self.plan_data is None:
			return None

		for _, plan in self.plan_data.iterrows():
			if pd.isna(plan['plan_id']):
				# A missing id would stringify to 'nan' and match unrelated words.
				continue

			pid = str(plan['plan_id']).lower()
			if not pid:
				continue

			# Whole-token match: plan "P1" must not match a query about "P10".
			token = r'(?<![a-z0-9])' + re.escape(pid) + r'(?![a-z0-9])'
			if re.search(token, query_lower):
				coverage_answer = self._analyze_coverage_question(query_lower, plan)
				return {
					'type': 'plan_coverage',
					'plan_id': str(plan['plan_id']),
					'coverage_type': str(plan['coverage_type']),
					'covered_services': str(plan['covered_services']),
					'copay': str(plan['copay']),
					'notes': str(plan['notes']),
					'coverage_answer': coverage_answer
				}

		return None

	def _analyze_coverage_question(self, query_lower, plan_row):
		"""Answer a coverage question from one plan row.

		Args:
			query_lower: The lower-cased query text.
			plan_row: Mapping-like row exposing ``get`` (Series or dict).

		Returns:
			A yes/no sentence for "is <service> covered" questions, the copay
			for cost questions, otherwise the list of covered services.
		"""
		services = str(plan_row.get('covered_services', '')).lower()

		# Find asked service (e.g., "is dental covered"). The leading \b keeps
		# the "is" inside words such as "this" from starting a match.
		m = re.search(r'\bis\s+([a-z\s]+?)\s+covered', query_lower, flags=re.I)
		if m:
			asked = m.group(1).strip().lower()
			if asked:
				return f"Yes, {asked} is covered." if asked in services else f"No, {asked} is not covered."

		# Copay questions
		if any(word in query_lower for word in ['copay', 'cost', 'pay', 'charge', 'fee']):
			return f"Copay: {plan_row.get('copay', '')}"

		return f"This plan covers: {plan_row.get('covered_services', '')}"

	def _generate_with_trained_model(self, user_query):
		"""Model removed ‚Äî CSV-only mode."""
		return {
			'type': 'generated_response',
			'response': 'Model disabled. Using CSV-based response only.'
		}

	def _format_examples(self, frame, describe):
		"""Join up to MAX_FALLBACK_EXAMPLES row labels into one string.

		Args:
			frame: DataFrame to sample from, or None.
			describe: Callable turning one row into its label.

		Returns:
			Comma-separated labels, followed by "(and N more)" when the frame
			has more rows than the limit; 'none loaded' for no data.
		"""
		if frame is None or frame.empty:
			return 'none loaded'

		labels = [str(describe(row)) for _, row in frame.head(self.MAX_FALLBACK_EXAMPLES).iterrows()]
		summary = ', '.join(labels)

		remaining = len(frame) - len(labels)
		if remaining > 0:
			summary += f" (and {remaining} more)"
		return summary

	def _get_fallback_response(self, query):
		"""Build a help response when no specific match was found.

		The listed examples are capped at MAX_FALLBACK_EXAMPLES so the message
		stays small and never enumerates an entire dataset.
		"""
		query_lower = query.lower()

		if any(word in query_lower for word in ['denial', 'code', 'reject']):
			available_codes = self._format_examples(
				self.denial_data, lambda row: f"{row['user_code']}{row['denial_code']}"
			)
			return {
				'type': 'help_response',
				'message': f"I can help explain denial codes. Available codes: {available_codes}. Please specify which code you'd like to know about."
			}

		elif any(word in query_lower for word in ['member', 'patient']):
			available_members = self._format_examples(
				self.member_data, lambda row: f"{row['member_name']} ({row['member_id']})"
			)
			return {
				'type': 'help_response',
				'message': f"I can look up member information. Available members: {available_members}. Please specify which member you'd like to find."
			}

		elif any(word in query_lower for word in ['plan', 'coverage']):
			# The helper stringifies each label, so numeric plan ids cannot break the join.
			available_plans = self._format_examples(self.plan_data, lambda row: row['plan_id'])
			return {
				'type': 'help_response',
				'message': f"I can provide plan coverage information. Available plans: {available_plans}. Please specify which plan you're asking about."
			}

		return {
			'type': 'general_help',
			'message': "I can help you with denial code explanations, member information lookups, and plan coverage questions. Please ask me about any of these topics."
		}

# Initialize the service
# The constructor loads the CSV files. If construction (or the log call) raises,
# the error is logged and model_service is set to None; the endpoints below
# test model_service for truthiness before using it.
try:
	model_service = TrainedCSRModelService()
	logger.info("‚úÖ Trained CSR Model Service initialized successfully")
except Exception as e:
	logger.error(f"‚ùå Failed to initialize service: {e}")
	model_service = None

# Flask endpoints
@app.route('/query', methods=['POST'])
def process_query():
	"""Main query endpoint for Spring Boot backend to call.

	Expects a JSON object with a string ``query`` and an optional ``type``.

	Returns:
		200 with the service result for a valid query; 400 when the body is
		not a JSON object or carries no usable query string; 500 when the
		service is unavailable or processing fails unexpectedly.
	"""
	try:
		if not model_service:
			return jsonify({'success': False, 'error': 'Model service is not available'}), 500

		# silent=True: a malformed or non-JSON body yields None instead of
		# raising, so a client mistake is answered with 400 rather than 500.
		data = request.get_json(silent=True)
		if data is not None and not isinstance(data, dict):
			return jsonify({'success': False, 'error': 'Please provide a query'}), 400

		payload = data or {}
		raw_query = payload.get('query', '')
		if not isinstance(raw_query, str):
			# e.g. {"query": 42} or {"query": null}
			return jsonify({'success': False, 'error': 'Please provide a query'}), 400

		user_query = raw_query.strip()
		query_type = payload.get('type', None)  # Optional: 'denial', 'member', 'coverage'

		if not user_query:
			return jsonify({'success': False, 'error': 'Please provide a query'}), 400

		logger.info(f"üîç Processing query: {user_query} (type: {query_type})")

		# Process the query
		result = model_service.process_query(user_query, query_type)

		logger.info(f"‚úÖ Response generated from {result.get('source', 'unknown')}")
		return jsonify(result)

	except Exception as e:
		logger.error(f"Query endpoint error: {e}")
		return jsonify({'success': False, 'error': 'Failed to process query'}), 500

@app.route('/health', methods=['GET'])
def health_check():
	"""Report service liveness plus whether a model and all datasets are loaded."""
	if model_service:
		model_loaded = bool(model_service.model)
		datasets = (
			model_service.denial_data,
			model_service.member_data,
			model_service.plan_data,
		)
		data_loaded = all(frame is not None for frame in datasets)
	else:
		# No service instance: neither a model nor data can be loaded.
		model_loaded = False
		data_loaded = False

	report = {
		'status': 'healthy',
		'service': 'CSR CSV Lookup API',
		'timestamp': datetime.now().isoformat(),
		'model_loaded': model_loaded,
		'data_loaded': data_loaded
	}
	return jsonify(report)

@app.route('/train-status', methods=['GET'])
def train_status():
	"""Report training status for CSV-only mode.

	The model fields are constant False here; only ``csv_data_available`` is
	computed, by checking that all three dataset files exist on disk.
	"""
	dataset_files = ('denial_reason.csv', 'member_subscription.csv', 'plan_coverage.csv')

	return jsonify({
		'trained_model_exists': False,
		'model_loaded': False,
		'csv_data_available': all(
			os.path.exists(os.path.join(DATASETS_DIR, filename))
			for filename in dataset_files
		),
		'recommendation': 'csv_only'
	})

@app.route('/available-data', methods=['GET'])
def available_data():
	"""Return every denial code, member and plan currently loaded.

	Responds with 500 when the service is unavailable. A dataset that is not
	loaded is reported as an empty list.
	"""
	if not model_service:
		return jsonify({'error': 'Service not available'}), 500

	def collect(frame, to_entry):
		# One entry per row, or an empty list when the dataset is not loaded.
		if frame is None:
			return []
		return [to_entry(row) for _, row in frame.iterrows()]

	denial_codes = collect(
		model_service.denial_data,
		lambda row: {
			'code': f"{row['user_code']}{row['denial_code']}",
			'description': row['description']
		}
	)
	members = collect(
		model_service.member_data,
		lambda row: {
			'id': row['member_id'],
			'name': row['member_name'],
			'plan': row['plan_id']
		}
	)
	plans = collect(
		model_service.plan_data,
		lambda row: {
			'id': row['plan_id'],
			'type': row['coverage_type'],
			'services': row['covered_services']
		}
	)

	return jsonify({
		'denial_codes': denial_codes,
		'members': members,
		'plans': plans
	})

# Start the API server in a background thread and keep the cell alive
def run_api():
	"""Print the endpoint banner, then serve the Flask app on port 5004.

	The server binds to 0.0.0.0 with debug mode and the reloader turned off;
	the call to app.run() blocks the calling thread.
	"""
	banner = (
		"üöÄ Starting CSR CSV Lookup API on http://localhost:5004",
		"üì° Endpoints:",
		"   POST /query",
		"   GET  /health",
		"   GET  /train-status",
		"   GET  /available-data",
	)
	for line in banner:
		print(line)

	app.run(host='0.0.0.0', port=5004, debug=False, use_reloader=False)

# Start the server only once per kernel: re-running this cell while the
# previous thread is alive would try to bind port 5004 a second time.
_running_thread = globals().get('api_thread')
if _running_thread is not None and _running_thread.is_alive():
	print("‚ö†Ô∏è API thread already running on port 5004. Restart the kernel to apply changes made to this cell.")
else:
	api_thread = threading.Thread(target=run_api, daemon=True)
	api_thread.start()

	print("üéâ API service started! Keep this cell running.")
	print("Your Spring Boot backend can connect to: http://localhost:5004")

	# No blocking loop in CSV-only mode
	# Give the server a moment to bind, then report what actually happened.
	time.sleep(2)
	if api_thread.is_alive():
		print("‚úÖ API thread running (non-blocking).")
	else:
		print("‚ùå API thread stopped during startup. Check the log output above (is port 5004 already in use?).")

INFO:__main__:‚úÖ Loaded CSV data: 415 denials, 10000 members, 100 plans
INFO:__main__:‚úÖ Trained CSR Model Service initialized successfully


üöÄ Starting CSR CSV Lookup API on http://localhost:5004
üì° Endpoints:
   POST /query
   GET  /health
   GET  /train-status
   GET  /available-data
üéâ API service started! Keep this cell running.
Your Spring Boot backend can connect to: http://localhost:5004
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5004
 * Running on http://192.168.1.113:5004
INFO:werkzeug:[33mPress CTRL+C to quit[0m


‚úÖ API thread running (non-blocking).
